chg: [feed] Use less memory when parsing CSV feeds

pull/6115/head
Jakub Onderka 2020-07-14 10:28:31 +02:00
parent 3df94f868f
commit c7d17cad9d
3 changed files with 142 additions and 41 deletions

View File

@ -1,4 +1,5 @@
<?php
require_once __DIR__ . '/TmpFileTool.php';
class ComplexTypeTool
{
@ -140,31 +141,28 @@ class ComplexTypeTool
return array_values($array);
}
/*
* parse a CSV file with the given settings
/**
* Parse a CSV file with the given settings
* All lines starting with # are stripped
* The settings can contain the following:
* delimiter: Expects a delimiter string (default is a simple comma).
* For example, to split the following line: "value1##comma##value2" simply pass $settings['delimiter'] = "##comma##";
* value: Expects an array (or a comma separated string) with numeric values denoting the columns containing indicators. If this is not set then every column will be checked. (column numbers start at 1)
* @param string $input
* @param array $settings
* @return array
* @throws Exception
*/
// NOTE(review): this span looks like a rendered diff in which the previous and the new implementation of
// checkCSV() are interleaved without +/- markers (see the hunk header further down) - confirm against the
// real file before changing any code here.
public function checkCSV($input, $settings = array())
{
// Presumably the previous implementation: the whole input is split into rows and parsed into $data in memory.
$delimiter = !empty($settings['delimiter']) ? $settings['delimiter'] : ",";
$rows = str_getcsv($input, "\n");
unset($input);
$data = array();
foreach ($rows as $k => $row) {
if (empty($row[0]) || $row[0] === '#') {
continue;
}
if ($delimiter == '\t') {
$data[$k] = explode("\t", $row);
} else {
$data[$k] = str_getcsv($row, $delimiter);
}
// Presumably the new implementation starts here: empty input yields an empty result right away.
if (empty($input)) {
return [];
}
// Delimiter defaults to a comma when the setting is missing or empty.
$delimiter = !empty($settings['delimiter']) ? $settings['delimiter'] : ",";
// The setting may hold the two-character literal '\t' (single quotes); it is replaced by a real tab character.
if ($delimiter === '\t') {
$delimiter = "\t";
}
unset($rows);
// The column filter is read from $settings['value']; a non-array value is split on commas.
$values = !empty($settings['value']) ? $settings['value'] : array();
if (!is_array($values)) {
$values = explode(',', $values);
@ -172,23 +170,34 @@ class ComplexTypeTool
// Normalise every requested column number to an integer.
foreach ($values as $key => $value) {
$values[$key] = intval($value);
}
$iocArray = array();
foreach ($data as $rowPos => $row) {
// Write to tmp file to save memory
$tmpFile = new TmpFileTool();
$tmpFile->write($input);
unset($input);
$iocArray = [];
// Rows are pulled one at a time from the generator returned by TmpFileTool::csv().
foreach ($tmpFile->csv($delimiter) as $row) {
if (!empty($row[0][0]) && $row[0][0] === '#') { // Comment
continue;
}
foreach ($row as $elementPos => $element) {
// An empty column filter means every column is checked; $elementPos is zero-based, hence the + 1
// when comparing against the 1-based column numbers.
if ((!empty($values) && in_array(($elementPos + 1), $values)) || empty($values)) {
if (empty($values) || in_array(($elementPos + 1), $values)) {
// Strips surrounding whitespace, NUL, double and single quotes. Inside a double-quoted PHP string
// \' is not an escape sequence, so the backslash itself is part of the trimmed character list too.
$element = trim($element, " \t\n\r\0\x0B\"\'");
if (isset($settings['excluderegex']) && !empty($settings['excluderegex'])) {
if (preg_match($settings['excluderegex'], $element)) {
continue;
}
// Nothing left after trimming (note that empty() also treats the string "0" as empty).
if (empty($element)) {
continue;
}
// Skip elements matching the optional exclusion regex.
if (!empty($settings['excluderegex']) && preg_match($settings['excluderegex'], $element)) {
continue;
}
// Only truthy results of __resolveType() are collected.
$resolvedResult = $this->__resolveType($element);
if (!empty($resolvedResult)) {
if ($resolvedResult) {
$iocArray[] = $resolvedResult;
}
}
}
}
return $iocArray;
}
@ -244,9 +253,8 @@ class ComplexTypeTool
private function __resolveType($raw_input)
{
$input = array(
'raw' => trim($raw_input)
);
$input = array('raw' => trim($raw_input));
$input = $this->__refangInput($input);
$input = $this->__extractPort($input);

View File

@ -37,6 +37,31 @@ class TmpFileTool
}
}
/**
 * Lazily iterate over the temporary file and yield every line parsed as CSV.
 *
 * rewind() is called before the first line is read. After the last line has been
 * yielded, the file handle is closed and $this->tmpfile is set to null.
 *
 * A blank line is yielded as whatever fgetcsv() returns for it (an array with a
 * single null field), it is not treated as an error.
 *
 * @param string $delimiter Field delimiter passed to fgetcsv()
 * @param string $enclosure Field enclosure character passed to fgetcsv()
 * @param string $escape Escape character passed to fgetcsv()
 * @return Generator Yields one array of fields per line
 * @throws Exception When reading stops before the end of the file is reached
 */
public function csv($delimiter = ',', $enclosure = '"', $escape = "\\")
{
    $this->rewind();
    $line = 0;
    // Loop on the return value of fgetcsv() instead of on feof(): feof() only becomes true after a read
    // has hit the end of the stream, so data ending with a newline would otherwise trigger one extra
    // fgetcsv() call that returns false and was wrongly reported as a read error.
    while (($result = fgetcsv($this->tmpfile, 0, $delimiter, $enclosure, $escape)) !== false) {
        $line++;
        yield $result;
    }
    // fgetcsv() returns false both at end of file and on failure; only the latter is an error.
    if (!feof($this->tmpfile)) {
        throw new Exception("Could not read line $line from temporary CSV file.");
    }
    fclose($this->tmpfile);
    $this->tmpfile = null;
}
/**
* @return Generator
* @throws Exception

View File

@ -5,25 +5,84 @@ use PHPUnit\Framework\TestCase;
class ComplexTypeToolTest extends TestCase
{
/**
 * A feed with one leading '#' line, one bare and one quoted IP address is expected to give exactly two results.
 */
public function testCheckCSV(): void
{
    $feed = <<<CSV
# Downloaded from 1.1.1.1
127.0.0.1
"127.0.0.2"
CSV;
    $tool = new ComplexTypeTool();
    $parsed = $tool->checkCSV($feed);

    $this->assertCount(2, $parsed);
}
/**
 * Tab delimited feed, passed with the literal '\t' delimiter setting: five results are expected
 * from the four data lines below the '#' banner.
 */
public function testCheckCSVTabulator(): void
{
    $feed = <<<CSV
###########################################################################################
# Downloaded from 1.1.1.1
###########################################################################################
127.0.0.1\t127.0.0.3
"127.0.0.2"
58.214.25.190
58.214.239.53
CSV;
    $settings = ['delimiter' => '\t'];
    $tool = new ComplexTypeTool();
    $parsed = $tool->checkCSV($feed, $settings);

    $this->assertCount(5, $parsed);
}
/**
 * With the 'value' setting restricted to column 1, only the first address of each
 * tab delimited line is expected in the result, typed as ip-dst.
 */
public function testCheckCSVValues(): void
{
    $feed = <<<CSV
127.0.0.1\t127.0.0.2
127.0.0.3\t127.0.0.4
CSV;
    $tool = new ComplexTypeTool();
    $parsed = $tool->checkCSV($feed, ['value' => '1', 'delimiter' => '\t']);

    $this->assertCount(2, $parsed);

    $expectedValues = ['127.0.0.1', '127.0.0.3'];
    foreach ($expectedValues as $index => $expected) {
        $entry = $parsed[$index];
        $this->assertEquals($expected, $entry['value']);
        $this->assertEquals('ip-dst', $entry['default_type']);
    }
}
/**
 * An empty string is expected to produce no results.
 */
public function testCheckCSVEmpty(): void
{
    $tool = new ComplexTypeTool();

    $this->assertCount(0, $tool->checkCSV(''));
}
/**
 * Lines consisting only of delimiters and whitespace are expected to produce no results.
 */
public function testCheckCSVEmptyLines(): void
{
    $onlyDelimiters = ",,,\t\n,,,,,";
    $tool = new ComplexTypeTool();

    $this->assertCount(0, $tool->checkCSV($onlyDelimiters));
}
/**
 * The bundled tests/event.csv fixture is expected to produce 37 results.
 */
public function testCheckCSVTestFile(): void
{
    $fixture = file_get_contents(__DIR__ . '/../../tests/event.csv');
    $tool = new ComplexTypeTool();
    $parsed = $tool->checkCSV($fixture);

    $this->assertCount(37, $parsed);
}
/**
 * Free text that starts with a '#' header line followed by five differently
 * delimited/quoted IP addresses: five results are expected.
 *
 * NOTE(review): this span looks like a rendered diff where the previous inline-heredoc
 * call and the new $text variant are interleaved without +/- markers - confirm against
 * the real file before editing the code.
 */
public function testCheckFreeTextHeader(): void
{
$complexTypeTool = new ComplexTypeTool();
$results = $complexTypeTool->checkFreeText(<<<EOT
$text = <<<EOT
# LAST 1000 # UTC UPDATE 2020-07-13 08:15:00
127.0.0.1,(127.0.0.2), <127.0.0.3>; "127.0.0.4" '127.0.0.5'
EOT
);
127.0.0.1,(127.0.0.2), <127.0.0.3>; "127.0.0.4" "'127.0.0.5'"
EOT;
$results = $complexTypeTool->checkFreeText($text);
$this->assertCount(5, $results);
// Presumably the previous, unrolled form of the per-result assertions.
$this->assertEquals('127.0.0.1', $results[0]['value']);
$this->assertEquals('ip-dst', $results[0]['default_type']);
$this->assertEquals('127.0.0.2', $results[1]['value']);
$this->assertEquals('ip-dst', $results[1]['default_type']);
$this->assertEquals('127.0.0.3', $results[2]['value']);
$this->assertEquals('ip-dst', $results[2]['default_type']);
$this->assertEquals('127.0.0.4', $results[3]['value']);
$this->assertEquals('ip-dst', $results[3]['default_type']);
$this->assertEquals('127.0.0.5', $results[4]['value']);
$this->assertEquals('ip-dst', $results[4]['default_type']);
// Every result must carry the expected address, in input order, typed as ip-dst.
foreach (['127.0.0.1', '127.0.0.2', '127.0.0.3', '127.0.0.4', '127.0.0.5'] as $k => $test) {
$this->assertEquals($test, $results[$k]['value']);
$this->assertEquals('ip-dst', $results[$k]['default_type']);
}
}
public function testCheckFreeTextIpv4(): void
@ -258,6 +317,15 @@ EOT
$this->assertEquals('url', $results[0]['default_type']);
}
/**
 * A URL with an explicit port is expected back unchanged as a single result of type url.
 */
public function testCheckFreeTextUrlWithPort(): void
{
    $tool = new ComplexTypeTool();
    $parsed = $tool->checkFreeText('https://github.com:443/MISP/MISP');

    $this->assertCount(1, $parsed);

    $first = $parsed[0];
    $this->assertEquals('https://github.com:443/MISP/MISP', $first['value']);
    $this->assertEquals('url', $first['default_type']);
}
public function testCheckFreeTextUrlWithoutProtocol(): void
{
$complexTypeTool = new ComplexTypeTool();