mirror of https://github.com/CIRCL/PyCIRCLean
Move non-filecheck.py binaries into examples directory
Tests for these scripts were also removed from /tests and from .travis.yml. Two .zip archives were accidentally deleted from /tests/src_invalid; re-added them and changed .gitignore to prevent the problem.
pull/8/head
parent
f7ab393eb6
commit
21cc175867
|
@ -67,8 +67,8 @@ target/
|
||||||
*.vrb
|
*.vrb
|
||||||
|
|
||||||
# Project specific
|
# Project specific
|
||||||
/tests/dst/*
|
tests/dst/*
|
||||||
!/tests/logs/
|
tests/test_logs/*
|
||||||
!/tests/.keepdir
|
!tests/**/.keepdir
|
||||||
|
!tests/src_invalid/*
|
||||||
|
!tests/src_valid/*
|
||||||
|
|
20
.travis.yml
20
.travis.yml
|
@ -17,8 +17,6 @@ addons:
|
||||||
packages:
|
packages:
|
||||||
# General dependencies
|
# General dependencies
|
||||||
- p7zip-full
|
- p7zip-full
|
||||||
# generic.py dependencies
|
|
||||||
- ghostscript
|
|
||||||
# Testing dependencies
|
# Testing dependencies
|
||||||
- mercurial
|
- mercurial
|
||||||
|
|
||||||
|
@ -26,21 +24,7 @@ install:
|
||||||
# General dependencies
|
# General dependencies
|
||||||
- sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu/ trusty multiverse" && sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu/ trusty-updates multiverse"
|
- sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu/ trusty multiverse" && sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu/ trusty-updates multiverse"
|
||||||
- sudo apt-get update -qq
|
- sudo apt-get update -qq
|
||||||
- sudo apt-get install -y p7zip-rar
|
- sudo apt-get install -y p7zip-rar python-pip
|
||||||
# generic.py: pdf2htmlEX + dependencies
|
|
||||||
- sudo add-apt-repository ppa:fontforge/fontforge --yes
|
|
||||||
# to get a working 0.26 poppler
|
|
||||||
- sudo add-apt-repository ppa:delayargentina/delayx --yes
|
|
||||||
- sudo apt-get update -qq
|
|
||||||
- sudo apt-get install -y libpoppler-dev libpoppler-private-dev libspiro-dev libcairo-dev libpango1.0-dev libfreetype6-dev libltdl-dev libfontforge-dev python-imaging python-pip firefox xvfb
|
|
||||||
- git clone https://github.com/coolwanglu/pdf2htmlEX.git
|
|
||||||
- pushd pdf2htmlEX
|
|
||||||
- cmake -DCMAKE_INSTALL_PREFIX:PATH=/usr -DENABLE_SVG=ON .
|
|
||||||
- make
|
|
||||||
- sudo make install
|
|
||||||
- popd
|
|
||||||
# generic.py: Other dependencies
|
|
||||||
- sudo apt-get install -y libreoffice libreoffice-script-provider-python unoconv
|
|
||||||
# filecheck.py dependencies
|
# filecheck.py dependencies
|
||||||
- sudo apt-get install libxml2-dev libxslt1-dev
|
- sudo apt-get install libxml2-dev libxslt1-dev
|
||||||
- wget https://didierstevens.com/files/software/pdfid_v0_2_1.zip
|
- wget https://didierstevens.com/files/software/pdfid_v0_2_1.zip
|
||||||
|
@ -82,7 +66,7 @@ install:
|
||||||
- wget --no-check-certificate https://www.officedissector.com/corpus/fraunhoferlibrary.zip
|
- wget --no-check-certificate https://www.officedissector.com/corpus/fraunhoferlibrary.zip
|
||||||
- unzip -o fraunhoferlibrary.zip
|
- unzip -o fraunhoferlibrary.zip
|
||||||
- rm fraunhoferlibrary.zip
|
- rm fraunhoferlibrary.zip
|
||||||
- 7z x 42.zip -p42
|
- 7z x -p42 42.zip
|
||||||
- wget http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3
|
- wget http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3
|
||||||
- wget http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4
|
- wget http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4
|
||||||
- wget http://thewalter.net/stef/software/rtfx/sample.rtf
|
- wget http://thewalter.net/stef/software/rtfx/sample.rtf
|
||||||
|
|
|
@ -1,25 +1,18 @@
|
||||||
Examples
|
|
||||||
========
|
|
||||||
|
|
||||||
These are several sanitizers that demonstrate PyCIRCLean's capabilities. Feel free to
|
|
||||||
adapt or modify any of them to suit your requirements. In order to use any of these scripts,
|
|
||||||
you will first need to install the PyCIRCLean dependencies (preferably in a virtualenv):
|
|
||||||
|
|
||||||
```
|
|
||||||
pip install .
|
|
||||||
```
|
|
||||||
|
|
||||||
Requirements per script
|
|
||||||
=======================
|
|
||||||
|
|
||||||
filecheck.py
|
filecheck.py
|
||||||
------------
|
============
|
||||||
|
|
||||||
This is the script used by the [CIRCLean](https://github.com/CIRCL/Circlean)
|
This is the script used by the [CIRCLean](https://github.com/CIRCL/Circlean)
|
||||||
USB key sanitizer. It is designed to handle a range of file types, and will
|
USB key sanitizer. It is designed to handle a range of file types, and will
|
||||||
mark them as dangerous if they meet certain criteria.
|
mark them as dangerous if they meet certain criteria.
|
||||||
|
|
||||||
Requirements by type of document:
|
Before installing the filecheck.py dependencies, make sure to install the PyCIRCLean
|
||||||
|
dependencies:
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install .
|
||||||
|
```
|
||||||
|
|
||||||
|
Dependencies by type of document:
|
||||||
* Microsoft office: oletools, olefile
|
* Microsoft office: oletools, olefile
|
||||||
* OOXML: officedissector
|
* OOXML: officedissector
|
||||||
* PDF: pdfid
|
* PDF: pdfid
|
||||||
|
@ -38,47 +31,3 @@ manually in the directory where filecheck will be run.
|
||||||
wget https://didierstevens.com/files/software/pdfid_v0_2_1.zip
|
wget https://didierstevens.com/files/software/pdfid_v0_2_1.zip
|
||||||
unzip pdfid_v0_2_1.zip
|
unzip pdfid_v0_2_1.zip
|
||||||
```
|
```
|
||||||
|
|
||||||
generic.py
|
|
||||||
----------
|
|
||||||
|
|
||||||
This is a script used by an older version of CIRCLean. It has more dependencies
|
|
||||||
than filecheck.py and they are more complicated to install.
|
|
||||||
|
|
||||||
Requirements by type of document:
|
|
||||||
* Office and all text files: unoconv, libreoffice
|
|
||||||
* PDF: ghostscript, pdf2htmlEX
|
|
||||||
|
|
||||||
```
|
|
||||||
# required for pdf2htmlEX
|
|
||||||
sudo add-apt-repository ppa:fontforge/fontforge --yes
|
|
||||||
sudo add-apt-repository ppa:coolwanglu/pdf2htmlex --yes
|
|
||||||
sudo apt-get update -qq
|
|
||||||
sudo apt-get install -qq libpoppler-dev libpoppler-private-dev libspiro-dev libcairo-dev libpango1.0-dev libfreetype6-dev libltdl-dev libfontforge-dev python-imaging python-pip firefox xvfb
|
|
||||||
# install pdf2htmlEX
|
|
||||||
git clone https://github.com/coolwanglu/pdf2htmlEX.git
|
|
||||||
pushd pdf2htmlEX
|
|
||||||
cmake -DCMAKE_INSTALL_PREFIX:PATH=/usr -DENABLE_SVG=ON .
|
|
||||||
make
|
|
||||||
sudo make install
|
|
||||||
popd
|
|
||||||
# Installing the rest
|
|
||||||
sudo apt-get install ghostscript p7zip-full p7zip-rar libreoffice unoconv
|
|
||||||
```
|
|
||||||
|
|
||||||
pier9.py
|
|
||||||
--------
|
|
||||||
|
|
||||||
This script has a list of file formats for various brands of industrial
|
|
||||||
manufacturing equipment, such as 3d printers, CNC machines, etc. It only
|
|
||||||
copies files that match these file formats.
|
|
||||||
|
|
||||||
No external dependencies required.
|
|
||||||
|
|
||||||
specific.py
|
|
||||||
-----------
|
|
||||||
|
|
||||||
As the name suggests, this script copies only specific file formats according
|
|
||||||
to the configuration provided by the user.
|
|
||||||
|
|
||||||
No external dependencies required.
|
|
||||||
|
|
|
@ -0,0 +1,56 @@
|
||||||
|
Examples
|
||||||
|
========
|
||||||
|
|
||||||
|
These are several sanitizers that demonstrate PyCIRCLean's capabilities. Feel free to
|
||||||
|
adapt or modify any of them to suit your requirements. In order to use any of these scripts,
|
||||||
|
you will first need to install the PyCIRCLean dependencies (preferably in a virtualenv):
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install .
|
||||||
|
```
|
||||||
|
|
||||||
|
Requirements per script
|
||||||
|
=======================
|
||||||
|
|
||||||
|
generic.py
|
||||||
|
----------
|
||||||
|
|
||||||
|
This is a script that was used by an older version of CIRCLean.
|
||||||
|
|
||||||
|
Requirements by type of document:
|
||||||
|
* Office and all text files: unoconv, libreoffice
|
||||||
|
* PDF: ghostscript, pdf2htmlEX
|
||||||
|
|
||||||
|
```
|
||||||
|
# required for pdf2htmlEX
|
||||||
|
sudo add-apt-repository ppa:fontforge/fontforge --yes
|
||||||
|
sudo add-apt-repository ppa:coolwanglu/pdf2htmlex --yes
|
||||||
|
sudo apt-get update -qq
|
||||||
|
sudo apt-get install -qq libpoppler-dev libpoppler-private-dev libspiro-dev libcairo-dev libpango1.0-dev libfreetype6-dev libltdl-dev libfontforge-dev python-imaging python-pip firefox xvfb
|
||||||
|
# install pdf2htmlEX
|
||||||
|
git clone https://github.com/coolwanglu/pdf2htmlEX.git
|
||||||
|
pushd pdf2htmlEX
|
||||||
|
cmake -DCMAKE_INSTALL_PREFIX:PATH=/usr -DENABLE_SVG=ON .
|
||||||
|
make
|
||||||
|
sudo make install
|
||||||
|
popd
|
||||||
|
# Installing the rest
|
||||||
|
sudo apt-get install ghostscript p7zip-full p7zip-rar libreoffice unoconv
|
||||||
|
```
|
||||||
|
|
||||||
|
pier9.py
|
||||||
|
--------
|
||||||
|
|
||||||
|
This script contains a list of file formats for various brands of industrial
|
||||||
|
manufacturing equipment, such as 3d printers, CNC machines, etc. It only
|
||||||
|
copies files that match these file formats.
|
||||||
|
|
||||||
|
No external dependencies required.
|
||||||
|
|
||||||
|
specific.py
|
||||||
|
-----------
|
||||||
|
|
||||||
|
As the name suggests, this script copies only specific file formats according
|
||||||
|
to the configuration provided by the user.
|
||||||
|
|
||||||
|
No external dependencies required.
|
3
setup.py
3
setup.py
|
@ -12,9 +12,6 @@ setup(
|
||||||
description='Standalone CIRCLean/KittenGroomer code.',
|
description='Standalone CIRCLean/KittenGroomer code.',
|
||||||
packages=['kittengroomer'],
|
packages=['kittengroomer'],
|
||||||
scripts=[
|
scripts=[
|
||||||
'bin/generic.py',
|
|
||||||
'bin/pier9.py',
|
|
||||||
'bin/specific.py',
|
|
||||||
'bin/filecheck.py'
|
'bin/filecheck.py'
|
||||||
],
|
],
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -1,50 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from bin.generic import KittenGroomer, File, main
|
|
||||||
from tests.logging import save_logs
|
|
||||||
|
|
||||||
skipif_nodeps = pytest.mark.skipif(os.path.exists('/usr/bin/unoconv') is False,
|
|
||||||
reason="Dependencies aren't installed")
|
|
||||||
|
|
||||||
|
|
||||||
@skipif_nodeps
|
|
||||||
class TestIntegration:
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def src_valid(self):
|
|
||||||
return os.path.join(os.getcwd(), 'tests/src_valid')
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def src_invalid(self):
|
|
||||||
return os.path.join(os.getcwd(), 'tests/src_invalid')
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def dst(self):
|
|
||||||
return os.path.join(os.getcwd(), 'tests/dst')
|
|
||||||
|
|
||||||
def test_generic(self, src_valid, dst):
|
|
||||||
groomer = KittenGroomer(src_valid, dst, debug=True)
|
|
||||||
groomer.processdir()
|
|
||||||
test_description = 'generic_valid'
|
|
||||||
save_logs(groomer, test_description)
|
|
||||||
|
|
||||||
def test_generic_2(self, src_invalid, dst):
|
|
||||||
groomer = KittenGroomer(src_invalid, dst, debug=True)
|
|
||||||
groomer.processdir()
|
|
||||||
test_description = 'generic_invalid'
|
|
||||||
save_logs(groomer, test_description)
|
|
||||||
|
|
||||||
|
|
||||||
class TestFileHandling:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# We're going to give KittenGroomer a bunch of files, and it's going to process them
|
|
||||||
# Maybe we want to make a function that processdir delegates to? Or is it just the File Object that's responsible?
|
|
||||||
# Ideally we should be able to pass a path to a function and have it do stuff? And then we can test that function?
|
|
||||||
# So we have a function that takes a path and returns...log info? That makes sense actually. Or some sort of meta data
|
|
||||||
# The function could maybe be called processfile
|
|
|
@ -1,53 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from bin.specific import KittenGroomerSpec
|
|
||||||
from bin.pier9 import KittenGroomerPier9
|
|
||||||
from tests.logging import save_logs
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def src_valid():
|
|
||||||
return os.path.join(os.getcwd(), 'tests/src_valid')
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def src_invalid():
|
|
||||||
return os.path.join(os.getcwd(), 'tests/src_invalid')
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def dst():
|
|
||||||
return os.path.join(os.getcwd(), 'tests/dst')
|
|
||||||
|
|
||||||
|
|
||||||
def test_specific_valid(src_valid, dst):
|
|
||||||
groomer = KittenGroomerSpec(src_valid, dst, debug=True)
|
|
||||||
groomer.processdir()
|
|
||||||
test_description = 'specific_valid'
|
|
||||||
save_logs(groomer, test_description)
|
|
||||||
|
|
||||||
|
|
||||||
def test_specific_invalid(src_invalid, dst):
|
|
||||||
groomer = KittenGroomerSpec(src_invalid, dst, debug=True)
|
|
||||||
groomer.processdir()
|
|
||||||
test_description = 'specific_invalid'
|
|
||||||
save_logs(groomer, test_description)
|
|
||||||
|
|
||||||
|
|
||||||
def test_pier9_valid(src_invalid, dst):
|
|
||||||
groomer = KittenGroomerPier9(src_invalid, dst, debug=True)
|
|
||||||
groomer.processdir()
|
|
||||||
test_description = 'pier9_valid'
|
|
||||||
save_logs(groomer, test_description)
|
|
||||||
|
|
||||||
|
|
||||||
def test_pier9_invalid(src_invalid, dst):
|
|
||||||
groomer = KittenGroomerPier9(src_invalid, dst, debug=True)
|
|
||||||
groomer.processdir()
|
|
||||||
test_description = 'pier9_invalid'
|
|
||||||
save_logs(groomer, test_description)
|
|
Loading…
Reference in New Issue