Merge pull request #188 from cvandeplas/master

ta import - noise removal
pull/189/head
Christophe Vandeplas 2018-05-16 11:53:19 +02:00 committed by GitHub
commit e32a39c6f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 16 additions and 12 deletions

View File

@ -421,20 +421,22 @@ def cleanup_url(item):
def cleanup_filepath(item): def cleanup_filepath(item):
noise_substrings = { noise_substrings = {
'C:\\Windows\\Prefetch\\', '\\AppData\\Local\\GDIPFONTCACHEV1.DAT',
'\\AppData\\Roaming\\Microsoft\\Windows\\Recent\\',
'\\AppData\\Roaming\\Microsoft\\Office\\Recent\\',
'C:\\ProgramData\\Microsoft\\OfficeSoftwareProtectionPlatform\\Cache\\cache.dat',
'\\AppData\\Local\\Microsoft\\Windows\\Temporary Internet Files\\Content.',
'\\AppData\\Local\\Microsoft\\Internet Explorer\\Recovery\\High\\',
'\\AppData\\Local\\Microsoft\\Internet Explorer\\DOMStore\\', '\\AppData\\Local\\Microsoft\\Internet Explorer\\DOMStore\\',
'\\AppData\\LocalLow\\Microsoft\\Internet Explorer\\Services\\search_', '\\AppData\\Local\\Microsoft\\Internet Explorer\\Recovery\\High\\',
'\\AppData\\Local\\Microsoft\\Windows\\History\\History.',
'\\AppData\\Roaming\\Microsoft\\Windows\\Cookies\\',
'\\AppData\\LocalLow\\Microsoft\\CryptnetUrlCache\\',
'\\AppData\\Local\\Microsoft\\Windows\\Caches\\', '\\AppData\\Local\\Microsoft\\Windows\\Caches\\',
'\\AppData\\Local\\Microsoft\\Windows\WebCache\\',
'\\AppData\\Local\\Microsoft\\Windows\\Explorer\\thumbcache', '\\AppData\\Local\\Microsoft\\Windows\\Explorer\\thumbcache',
'\\AppData\\Local\\Microsoft\\Windows\\History\\History.',
'\\AppData\\Local\\Microsoft\\Windows\\Temporary Internet Files\\Content.',
'\\AppData\\Local\\Microsoft\\Windows\\WebCache\\',
'\\AppData\\Local\\Temp\\.*tmp$',
'\\AppData\\LocalLow\\Microsoft\\CryptnetUrlCache\\',
'\\AppData\\LocalLow\\Microsoft\\Internet Explorer\\Services\\search_',
'\\AppData\\Roaming\\Microsoft\\Office\\Recent\\',
'\\AppData\\Roaming\\Microsoft\\Windows\\Cookies\\',
'\\AppData\\Roaming\\Microsoft\\Windows\\Recent\\',
'C:\\ProgramData\\Microsoft\\OfficeSoftwareProtectionPlatform\\Cache\\cache.dat',
'C:\\Windows\\Prefetch\\',
'\\AppData\\Roaming\\Adobe\\Acrobat\\9.0\\SharedDataEvents-journal', '\\AppData\\Roaming\\Adobe\\Acrobat\\9.0\\SharedDataEvents-journal',
'\\AppData\\Roaming\\Adobe\\Acrobat\\9.0\\UserCache.bin', '\\AppData\\Roaming\\Adobe\\Acrobat\\9.0\\UserCache.bin',
@ -460,14 +462,16 @@ def cleanup_regkey(item):
r'\\Software\\Microsoft\\Internet Explorer\\Main\\WindowsSearch', r'\\Software\\Microsoft\\Internet Explorer\\Main\\WindowsSearch',
r'\\Software\\Microsoft\\Office\\[0-9\.]+\\', r'\\Software\\Microsoft\\Office\\[0-9\.]+\\',
r'\\Software\\Microsoft\\Office\\Common\\Smart Tag\\', r'\\Software\\Microsoft\\Office\\Common\\Smart Tag\\',
r'\\SOFTWARE\\Microsoft\\OfficeSoftwareProtectionPlatform\\', r'\\Software\\Microsoft\\OfficeSoftwareProtectionPlatform\\',
r'\\Software\\Microsoft\\Shared Tools\\Panose\\', r'\\Software\\Microsoft\\Shared Tools\\Panose\\',
r'\\Software\\Microsoft\\Tracing\\', r'\\Software\\Microsoft\\Tracing\\',
r'\\Software\\Microsoft\\Tracing\\powershell_RASAPI32\\', r'\\Software\\Microsoft\\Tracing\\powershell_RASAPI32\\',
r'\\Software\\Microsoft\\Tracing\\powershell_RASMANCS\\', r'\\Software\\Microsoft\\Tracing\\powershell_RASMANCS\\',
r'\\Software\\Microsoft\\Windows\\CurrentVersion\\Action Center\\',
r'\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\RunMRU\\', r'\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\RunMRU\\',
r'\\Software\\Microsoft\\Windows\\CurrentVersion\\Installer\\UserData\\', r'\\Software\\Microsoft\\Windows\\CurrentVersion\\Installer\\UserData\\',
r'\\Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings\\', r'\\Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings\\',
r'\\System\\CurrentControlSet\\Services\\RdyBoost\\',
r'\\Usage\\SpellingAndGrammarFiles' r'\\Usage\\SpellingAndGrammarFiles'
} }
if list_in_string(noise_substrings, item, regex=True): if list_in_string(noise_substrings, item, regex=True):