1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-22 12:12:18 +01:00

Add check for uncommon filenames

Generally we want people to upload documents in accessible formats
like PDF, Word, Excel, and PowerPoint. This check warns if a file
is using an uncommon extension.
This commit is contained in:
Alan Orth 2019-08-10 23:41:16 +03:00
parent 5ff584a8d7
commit 9ce7dc6716
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
2 changed files with 49 additions and 0 deletions

View File

@ -86,6 +86,10 @@ def run(argv):
if match is not None: if match is not None:
df[column] = df[column].apply(check.date) df[column] = df[column].apply(check.date)
# Check: filename extension
if column == 'filename':
df[column] = df[column].apply(check.filename_extension)
# Write # Write
df.to_csv(args.output_file, index=False) df.to_csv(args.output_file, index=False)

View File

@ -253,3 +253,48 @@ def agrovoc(field, field_name):
print(f'Invalid AGROVOC ({field_name}): {value}') print(f'Invalid AGROVOC ({field_name}): {value}')
return field return field
def filename_extension(field):
    """Warn about filenames that have an uncommon extension.

    CSVs with a 'filename' column are likely meant as input for the SAFBuilder
    tool, which creates a Simple Archive Format bundle for importing metadata
    with accompanying PDFs or other files into DSpace.

    The field may hold several filenames separated by "||". Each one is
    compared, case insensitively, against a list of common extensions
    (.pdf, .doc(x), .ppt(x), .xls(x)). A warning is printed for every
    filename that does not end with one of them.

    Args:
        field: Contents of one 'filename' cell: a string, or a missing value
            as recognized by ``pd.isna``.

    Returns:
        The field, unchanged, so the function can be used with
        ``Series.apply``; None when the field is a missing value.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return None

    # Common filename extensions; a tuple so it can be passed straight to
    # str.endswith, which accepts a tuple of candidate suffixes.
    common_filename_extensions = (
        '.pdf',
        '.doc',
        '.docx',
        '.ppt',
        '.pptx',
        '.xls',
        '.xlsx',
    )

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        # Lowercase only for the comparison so the warning shows the value
        # exactly as it appears in the CSV. str.endswith is a strict suffix
        # test: unlike a regex "$" anchor it does not overlook a trailing
        # newline after the extension.
        if not value.lower().endswith(common_filename_extensions):
            print(f'Filename with uncommon extension: {value}')

    return field