Updated delimiter finder function

pull/190/head
chrisr3d 2018-05-18 11:38:13 +02:00
parent 1fb72f3c7a
commit 2b509a2fd3
No known key found for this signature in database
GPG Key ID: 6BBED1B63A6D639F
1 changed file with 23 additions and 20 deletions

View File

@ -19,6 +19,7 @@ userConfig = {'header': {
# Both 'mispComment' and 'attrComment' resolve to the single name 'comment'.
# NOTE(review): presumably used to normalise header column names - confirm
# against the code that reads this mapping.
duplicatedFields = {
    'mispType': {'mispComment': 'comment'},
    'attrField': {'attrComment': 'comment'},
}

# NOTE(review): looks like the field names accepted for an attribute -
# confirm against the code that consumes this list.
attributesFields = [
    'type',
    'value',
    'category',
    'to_ids',
    'comment',
    'distribution',
]

# Candidate separators tallied per line when the input has several fields.
delimiters = [
    ',',
    ';',
    '|',
    '/',
    '\t',
    ' ',
]
class CsvParser():
def __init__(self, header, has_header):
@ -29,29 +30,31 @@ class CsvParser():
def parse_data(self, data):
    """Clean the raw input lines and work out which delimiter they use.

    For every line, everything from the first ``#`` onwards is discarded
    and the remainder is stripped; lines that end up empty are dropped.

    Results are stored on the instance:

    * ``self.data`` - the cleaned lines, without the first one when
      ``self.has_header`` is true.
    * ``self.delimiter`` - ``None`` when ``self.fields_number`` is 1,
      otherwise whatever ``find_delimiter()`` returns.
    * ``self.delimiter_count`` - only (re)initialised when there is more
      than one field; filled in by ``parse_delimiter()``.

    :param data: iterable of text lines.
    """
    single_field = self.fields_number == 1
    if not single_field:
        # Fresh tally for every candidate separator.
        self.delimiter_count = dict.fromkeys(delimiters, 0)

    cleaned = []
    for line in data:
        content = line.split('#')[0].strip()
        if not content:
            continue
        if not single_field:
            self.parse_delimiter(content)
        cleaned.append(content)

    # Assign the rows before resolving the delimiter so find_delimiter()
    # never runs against a parser whose data attribute is still missing.
    self.data = cleaned[1:] if self.has_header else cleaned
    # find which delimiter is used
    self.delimiter = None if single_field else self.find_delimiter()
def parse_delimiter(self, line):
    """Credit each candidate separator that occurs often enough in *line*.

    A candidate qualifies when it appears at least
    ``self.fields_number - 1`` times in the line; its entry in
    ``self.delimiter_count`` is then incremented by one.

    :param line: a single cleaned line of input.
    """
    qualifying = [
        candidate
        for candidate in delimiters
        if line.count(candidate) >= (self.fields_number - 1)
    ]
    for candidate in qualifying:
        self.delimiter_count[candidate] += 1
def find_delimiter(self):
    """Return the separator with the highest tally in ``delimiter_count``.

    The decision relies only on the per-delimiter tallies, so it does not
    need ``self.data`` to be populated yet.

    :return: ``None`` when ``self.fields_number`` is 1 or less; otherwise
        the delimiter with the largest count. On equal counts (including
        all zero) the delimiter that compares greatest as a string wins,
        because the candidates are ranked as ``(count, delimiter)`` tuples.
    """
    if self.fields_number <= 1:
        return None
    _, delimiter = max(
        (count, candidate)
        for candidate, count in self.delimiter_count.items()
    )
    return delimiter
def buildAttributes(self):
# if there is only 1 field of data