From accc177cd682ff0935a442d58e4f84f31f7692b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Wed, 16 Jan 2013 16:34:06 +0100 Subject: [PATCH] initial commit --- .gitignore | 2 + README | 75 +++++++++++++ TODO | 24 +++++ filesystem/etc/rc.local | 29 +++++ filesystem/opt/groomer/groomer.sh | 159 ++++++++++++++++++++++++++++ filesystem/opt/groomer/sedKillSpace | 1 + 6 files changed, 290 insertions(+) create mode 100644 .gitignore create mode 100644 README create mode 100644 TODO create mode 100755 filesystem/etc/rc.local create mode 100755 filesystem/opt/groomer/groomer.sh create mode 100644 filesystem/opt/groomer/sedKillSpace diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1c7c406 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +filesystem/opt/groomer/pdfbox-app-*.jar +filesystem/etc diff --git a/README b/README new file mode 100644 index 0000000..9eb26b6 --- /dev/null +++ b/README @@ -0,0 +1,75 @@ +How To +====== + +0. power off device +1. insert source USB in the TOP usb slot +2. insert target USB in the BOTTOM usb slot +3. wait. wait some more. it's slow and can take 30-60 minutes depending on how + many document conversions take place +4. when the only status LED left is the power indicator on the rPI, the process + is finished +5. 
power off the device and disconnect the drives + +Notes +===== + +* don't plug in USB devices with a hub because there's no way to tell it which + is source and target - it's the first drive enumerated (top port) that is the + source and the second (bottom port) is the target +* don't turn it off without shutting down the system, when grooming is done it + shuts down automatically: losing power while it's running can trash the OS + on the SD cards because SD cards don't always like dirty shutdowns (ie power loss) +* Using a target usb stick that has a status light as long as the device has + power is a really useful thing as the other status lights on the groomer + are less than indicative at times: because the 'OK' led on the rPi toggles on activity + it can be off for a long time while processing something and only comes back + on when that process finishes - hence why a USB that has some sort of LED activity + when just plugged in (even if not reading or writing but while the USB port is + powered) is helpful in determining when the process is finished - when + the rPI is shut down, the USB port power is shut off and that LED will also + then be off on the USB device +* Use a larger target device as all zip files get unpacked and processed onto + the target +* if you have an hdmi monitor plugged in you can watch what's happening for about + 30 minutes until the rPI's power savings kick in and turn off the monitor +* if only one usb stick is present at power up, it doesn't groom and looks like + a normal rPi +* if you want to ssh into the rPi username is 'pi' password 'raspberry' as per defaults + + +Technical notes +=============== + +* groomer script is in /opt/groomer/ with the other required files +* dependencies are libre-office and OpenJRE +* and the ip address is 192.168.1.89 +* the groomer process is kicked off in /etc/rc.local +* the heavy lifting takes place or is dispatched from /opt/groomer/groomer.sh + in that script file is what file types get 
processed (or if not listed there, + get ignored) +* there are two ways pdf's can get handled - right now they have their text extracted + to the target device, the other way copies it and extracts the text +* the pdf text extraction isn't perfect and is the slowest part of it, but should + be able to handle unicode stuff and currently doesn't do image extraction from + pdf's but could do that too + + +Discussion +========== + +* however image exports of pdf pages only have the images and no text so it's not + like saving each page to a jpg which would be a really handy and safe way of + converting pdf's +* spreadsheets and presentations get converted to pdfs to kill off any embedded + macros and it's assumed that it's not producing evil pdf's on export but does + nothing to sanitize any embedded links within those documents +* for spreadsheets, if they are longer than a page, only a page worth from that + sheet is exported right from the middle of the sheet (ie the top and bottom of + that sheet will get cut off and only the contents in the middle exported to pdf) + dumb but i figure if you want to go back to the source because it's interesting + enough on the groomed side of it, then you can take the extra precautions +* the groomed target only copies "safe" files, and does its best to convert any + potentially unsafe files to a safer format +* safe files being ones that i know of that can't contain malicious embedded macros + or other crap like that, and those that can get converted to something that won't + contain code after conversion diff --git a/TODO b/TODO new file mode 100644 index 0000000..ff4de54 --- /dev/null +++ b/TODO @@ -0,0 +1,24 @@ +TODO +==== + +* the script locations should be changed in the next version so they don't sit + next to the rPi's example development code that ships with the stock rPi +* the system isn't optimised and should be : cleanup and making it as close to + stock as possible +* Starting process should be more obfuscated +* strip 
exif data and leave it in a .txt file next to the image it came from + => exiftool +* set filesystem of OS in RO (physical switch and/or remount OS) +* mount source key in RO and noexec +* mount target key with noexec +* convert spreadsheets in csv ? +* convert documents (pdfs/*office/...) in images ? +* Have a look at Ghostscript to work on PDFs (.pdf -> .eps -> .png?) +* do not run the conversions as root +* take eth0 down in /etc/network/interfaces or in the groomer script disable the + interface before anything happens +* hdmi should stay up: solvable by poking the power management timer + (better not to disable the PM completely) +* get rid of pdfbox ? +* scripts to generate a SD card automatically (win/mac/linux) +* move the scripts away from /opt/ diff --git a/filesystem/etc/rc.local b/filesystem/etc/rc.local new file mode 100755 index 0000000..1fc5435 --- /dev/null +++ b/filesystem/etc/rc.local @@ -0,0 +1,29 @@ +#!/bin/sh -e +# +# rc.local +# +# This script is executed at the end of each multiuser runlevel. +# Make sure that the script will "exit 0" on success or any other +# value on error. +# +# In order to enable or disable this script just change the execution +# bits. +# +# By default this script does nothing. + +# Print the IP address +_IP=$(hostname -I) || true +if [ "$_IP" ]; then + printf "My IP address is %s\n" "$_IP" +fi + +PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + +if [ -e /dev/sda ]; then + if [ -e /dev/sdb ]; then + /opt/groomer/groomer.sh + /sbin/shutdown -h now + fi +fi + +exit 0 diff --git a/filesystem/opt/groomer/groomer.sh b/filesystem/opt/groomer/groomer.sh new file mode 100755 index 0000000..d58fbb7 --- /dev/null +++ b/filesystem/opt/groomer/groomer.sh @@ -0,0 +1,159 @@ +#!/bin/bash + +# groom da kitteh! 
# groomer.sh -- copy "safe" files from the source USB stick (/dev/sda*) onto
# the target stick (/dev/sdb1), converting documents to text or PDF on the way.
#
# File names on the source stick are untrusted input. Nothing in this script
# turns a file name into shell code: paths travel NUL-delimited from find(1)
# into quoted variables instead of being pasted into strings for `eval`.

GH=/opt/groomer/
JAVA=/usr/bin/java

SRC=/src
DST=/dst
TEMP=/dst/temp
ZIPTEMP=/dst/ziptemp
FL=${DST}/filelist.txt

# 1: copy PDFs to the target and extract their text there (pdfCopyDirty).
# anything else: only write the extracted text to the target (pdfCopyClean).
COPYDIRTYPDF=0

#######################################
# Print a name after applying the rename rules in ${GH}/sedKillSpace.
# Arguments: $1 - directory or file name
# Outputs:   the rewritten name on stdout
#######################################
_groomCleanName() {
  printf '%s\n' "$1" | sed -f "${GH}/sedKillSpace"
}

#######################################
# List files below a directory by extension (case-insensitive), NUL-delimited.
# -type f leaves out directories and symlinks, so cp is never asked to follow
# a link from the stick to a file somewhere else.
# Arguments: $1 - directory to search, $2 - extension without the dot
#######################################
_groomFind() {
  find "$1" -type f -iname "*.$2" -print0
}

#######################################
# Work out where a found file belongs below a destination root and create
# that directory.
# Globals:   GROOM_DIR, GROOM_FILE (written)
# Arguments: $1 - root that was searched
#            $2 - destination root
#            $3 - path of the found file (below $1)
# Returns:   status of mkdir
#######################################
_groomMapPath() {
  local root=$1 destRoot=$2 path=$3
  local dir=${path%/*}
  local name=${path##*/}

  GROOM_DIR="${destRoot}$(_groomCleanName "${dir#"$root"}")"
  GROOM_FILE="${GROOM_DIR}/$(_groomCleanName "$name")"
  mkdir -p -- "$GROOM_DIR"
}

#######################################
# Copy every file with the given extension from one tree to another, keeping
# the relative layout and rewriting names through sedKillSpace.
# Arguments: $1 - source root, $2 - destination root, $3 - extension
#######################################
_groomCopyTree() {
  local root=$1 destRoot=$2 ext=$3 path

  # The list is read on fd 3 so commands in the loop keep the script's stdin.
  while IFS= read -r -d '' -u 3 path; do
    _groomMapPath "$root" "$destRoot" "$path" || continue
    cp -fv -- "$path" "$GROOM_FILE"
  done 3< <(_groomFind "$root" "$ext")
}

#######################################
# Run pdfbox ExtractText on one PDF; pdfbox's stderr is discarded.
# Arguments: $1 - PDF to read, $2 - text file to write
#######################################
_groomExtractText() {
  "$JAVA" -jar "${GH}/pdfbox-app-1.7.1.jar" ExtractText "$1" "$2" \
    2> /dev/null < /dev/null
}

#######################################
# Copy all PDFs below $1 to the same relative place below $2, then extract
# text from every PDF found below $2 into <pdf>-extracted.txt next to it.
# Arguments: $1 - source root, $2 - destination root
#######################################
pdfCopyDirty() {
  local pdf

  _groomCopyTree "$1" "$2" pdf

  while IFS= read -r -d '' -u 3 pdf; do
    printf '%s extracting text to %s-extracted.txt\n' "$pdf" "$pdf"
    _groomExtractText "$pdf" "${pdf}-extracted.txt"
  done 3< <(_groomFind "$2" pdf)
}

#######################################
# Extract text from every PDF below $1 straight into the matching place
# below $2; the PDF itself is not copied.
# Arguments: $1 - source root, $2 - destination root
#######################################
pdfCopyClean() {
  local pdf out

  while IFS= read -r -d '' -u 3 pdf; do
    _groomMapPath "$1" "$2" "$pdf" || continue
    out="${GROOM_FILE}-extracted.txt"
    printf '%s extracting text to %s\n' "$pdf" "$out"
    _groomExtractText "$pdf" "$out"
  done 3< <(_groomFind "$1" pdf)
}

#######################################
# Pick the PDF handler according to COPYDIRTYPDF.
# Arguments: $1 - source root, $2 - destination root
#######################################
_groomPdfs() {
  if (( COPYDIRTYPDF == 1 )); then
    pdfCopyDirty "$1" "$2"
  else
    pdfCopyClean "$1" "$2"
  fi
}

#######################################
# Copy image, video, audio and plain-text files unchanged.
# Globals:   TYPES (written)
# Arguments: $1 - source root, $2 - destination root
#######################################
copySafeFiles() {
  local type

  TYPES="\
    jpg jpeg gif png tif tga raw \
    mp4 avi mov \
    mp3 wav \
    txt xml csv tsv \
    "
  for type in $TYPES; do
    _groomCopyTree "$1" "$2" "$type"
  done
}

#######################################
# Convert office documents: word-processor files to text, spreadsheets and
# presentations to PDF.
# Globals:   TYPES, FILTER, OUT (written; read by convertCopyFilesHelper)
# Arguments: $1 - source root, $2 - destination root, $3 - scratch directory
#######################################
convertCopyFiles() {
  # wordy documents
  TYPES="doc docx odt sxw rtf wpd htm html"
  FILTER=Text
  OUT=txt
  convertCopyFilesHelper "$1" "$2" "$3"

  # spreadsheets ("xlsx" was misspelled "xslx", so those files were skipped)
  TYPES="xls xlsx ods sxc"
  FILTER=calc_pdf_Export
  OUT=pdf
  convertCopyFilesHelper "$1" "$2" "$3"

  # presentation files
  TYPES="ppt pptx odp sxi"
  FILTER=impress_pdf_Export
  OUT=pdf
  convertCopyFilesHelper "$1" "$2" "$3"
}

#######################################
# For each extension in TYPES: stage matching files from $1 into $3, then
# have soffice convert each staged file into the matching directory below $2.
# Globals:   TYPES, OUT, FILTER (read)
# Arguments: $1 - source root, $2 - destination root, $3 - scratch directory
#            (further arguments are accepted and ignored)
#######################################
convertCopyFilesHelper() {
  local src=$1 dest=$2 tmp=$3
  local type doc outDir

  for type in $TYPES; do
    _groomCopyTree "$src" "$tmp" "$type"

    while IFS= read -r -d '' -u 3 doc; do
      outDir=${doc%/*}
      outDir="${dest}${outDir#"$tmp"}"
      mkdir -p -- "$outDir" || continue
      # The "extraced" spelling is kept so output file names do not change.
      soffice --headless \
        --convert-to "${type}-extraced.${OUT}:${FILTER}" \
        "$doc" --outdir "$outDir" < /dev/null
    done 3< <(_groomFind "$tmp" "$type")
  done
}

#######################################
# Stage every .zip below $1 into $3, unpack each one under
# ${ZIPTEMP}/<relative dir>/UNZIPPED_<name>/, delete the staged archive, then
# groom the unpacked tree into $2.
# Globals:   ZIPTEMP, TEMP (read)
# Arguments: $1 - source root, $2 - destination root, $3 - scratch directory
#######################################
unpackZip() {
  local src=$1 dest=$2 tmp=$3
  local zip rel outDir

  _groomCopyTree "$src" "$tmp" zip

  while IFS= read -r -d '' -u 3 zip; do
    rel=${zip%/*}
    rel=${rel#"$tmp"}
    outDir="${ZIPTEMP}${rel}/UNZIPPED_${zip##*/}/"
    if mkdir -p -- "$outDir"; then
      unzip "$zip" -d "$outDir" 2> /dev/null < /dev/null
    fi
    rm -rf -- "$zip"
  done 3< <(_groomFind "$tmp" zip)

  if [[ -d "$ZIPTEMP" ]]; then
    _groomPdfs "$ZIPTEMP" "$dest"
    copySafeFiles "$ZIPTEMP" "$dest" "$tmp"
    convertCopyFiles "$ZIPTEMP" "$dest" "$tmp"
    # ":?" aborts instead of expanding to "/*" if a variable is ever empty.
    rm -rf -- "${TEMP:?}"/*
    rm -rf -- "${ZIPTEMP:?}"/*
  fi
}

#######################################
# Mount the target, groom every partition of the source, clean up and power
# off.
# Globals:   SRC, DST, TEMP, ZIPTEMP, FL (read)
#            PARTCOUNT, targetDir (written)
# Returns:   exits 1 when /dev/sdb1 cannot be mounted
#######################################
main() {
  local partition

  [[ -d "$SRC" ]] || mkdir "$SRC"
  [[ -d "$DST" ]] || mkdir "$DST"

  umount "$DST" 2> /dev/null
  if ! mount /dev/sdb1 "$DST"; then
    # echo Could not mount target USB stick!
    exit 1
  fi
  echo "Target USB device mounted at $DST"
  rm -rf -- "${DST:?}"/FROM_PARTITION_*

  # scratch directories live on the target and must start out empty
  mkdir -p "$TEMP" "$ZIPTEMP"
  rm -rf -- "${TEMP:?}"/*
  rm -rf -- "${ZIPTEMP:?}"/*

  echo "Full file list from source USB" > "$FL"

  PARTCOUNT=1
  for partition in /dev/sda[1-9]*; do
    # an unmatched glob stays literal; skip it
    [[ -e "$partition" ]] || continue

    echo "Processing partition: ${PARTCOUNT} $partition"
    umount "$SRC" 2> /dev/null
    if ! mount -r "$partition" "$SRC"; then
      echo "could not mount $partition at /$SRC"
    else
      echo "$partition mounted at $SRC"

      echo "PARTITION $PARTCOUNT" >> "$FL"
      # every entry on the partition, written relative to the mount point
      find "$SRC" -mindepth 1 -printf '/%P\n' >> "$FL"

      # one directory per source partition on the target
      targetDir=${DST}/FROM_PARTITION_${PARTCOUNT}
      echo "copying to: $targetDir"
      mkdir -p "$targetDir"

      _groomPdfs "$SRC" "$targetDir"

      # copy stuff
      copySafeFiles "$SRC" "$targetDir"
      convertCopyFiles "$SRC" "$targetDir" "$TEMP"
      rm -rf -- "${TEMP:?}"/*

      # unpack and process archives
      unpackZip "$SRC" "$targetDir" "$TEMP"
    fi
    # the counter advances even when a partition could not be mounted
    PARTCOUNT=$((PARTCOUNT + 1))
  done

  # cleanup: remove exactly the two scratch directories. The old globs
  # "${TEMP}*" and "${ZIPTEMP}*" also matched unrelated files on the target
  # whose names merely start with "temp" or "ziptemp".
  rm -rf -- "${TEMP:?}" "${ZIPTEMP:?}"
  sync
  umount "$SRC"
  umount "$DST"

  /sbin/shutdown -h now
}

if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
  main "$@"
fi

# ---------------------------------------------------------------------------
# Remainder of the original patch, kept verbatim but commented out so it
# cannot run as part of the script above:
#
# diff --git a/filesystem/opt/groomer/sedKillSpace b/filesystem/opt/groomer/sedKillSpace
# new file mode 100644
# index 0000000..63f4ad0
# --- /dev/null
# +++ b/filesystem/opt/groomer/sedKillSpace
# @@ -0,0 +1 @@
# +s:\ :_:g