1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-09 22:56:01 +02:00

11 Commits

Author SHA1 Message Date
7255bf4707 Version 0.2.1 2019-08-11 10:39:39 +03:00
3aaf18c290 CHANGELOG.md: Move unreleased changes to 0.2.1 2019-08-11 10:39:18 +03:00
745306edd7 CHANGELOG.md: Add note about replacement of unnecessary Unicode 2019-08-11 00:09:35 +03:00
e324e321a2 data/test.csv: Add test for replacement of unnecessary Unicode 2019-08-11 00:08:44 +03:00
232ff99898 csv_metadata_quality/fix.py: Add more unnecessary Unicode fixes
Add a check for soft hyphens (U+00AD). In one sample CSV I have a
normal hyphen followed by a soft hyphen in an ISBN. This causes the
ISBN validation to fail.
2019-08-11 00:07:21 +03:00
13d5221378 csv_metadata_quality/check.py: Fix test for False 2019-08-10 23:52:53 +03:00
3c7a9eb75b CHANGELOG.md: Add check for uncommon filename extensions 2019-08-10 23:47:46 +03:00
a99fbd8a51 data/test.csv: Add test case for uncommon filename extension 2019-08-10 23:46:56 +03:00
e801042340 tests/test_check.py: Fix unused result
We don't need to capture the function's return value here because
pytest will capture stdout from the function.
2019-08-10 23:45:41 +03:00
62ef2a4489 tests/test_check.py: Add tests for file extensions 2019-08-10 23:44:13 +03:00
9ce7dc6716 Add check for uncommon filenames
Generally we want people to upload documents in accessible formats
like PDF, Word, Excel, and PowerPoint. This check warns if a file
is using an uncommon extension.
2019-08-10 23:41:16 +03:00
8 changed files with 114 additions and 26 deletions

View File

@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.2.1] - 2019-08-11
### Added
- Check for uncommon filename extensions
- Replacement of unnecessary Unicode characters like soft hyphens (U+00AD)
## [0.2.0] - 2019-08-09
### Added
- Handle Ctrl-C interrupt gracefully

View File

@ -86,6 +86,10 @@ def run(argv):
if match is not None:
df[column] = df[column].apply(check.date)
# Check: filename extension
if column == 'filename':
df[column] = df[column].apply(check.filename_extension)
# Write
df.to_csv(args.output_file, index=False)

View File

@ -253,3 +253,48 @@ def agrovoc(field, field_name):
print(f'Invalid AGROVOC ({field_name}): {value}')
return field
def filename_extension(field):
    """Check filename extension.

    CSVs with a 'filename' column are likely meant as input for the SAFBuilder
    tool, which creates a Simple Archive Format bundle for importing metadata
    with accompanying PDFs or other files into DSpace.

    This check warns if a filename has an uncommon extension (that is, other
    than .pdf, .xls(x), .doc(x), .ppt(x), case insensitive).

    Multi-value fields are split on the "||" separator and every value is
    checked on its own. One warning line is printed to stdout for each value
    whose extension is not in the common list.

    Return the field unchanged, including when the value is missing.
    """

    import re

    # Skip fields with missing values. Hand the value back as-is so that a
    # caller assigning the result of apply() to the column does not replace
    # the missing-value marker with None.
    if pd.isna(field):
        return field

    # List of common filename extensions
    common_filename_extensions = ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx']

    # Build a single case-insensitive pattern anchored at the end of the
    # filename, once per call rather than once per value and extension
    alternatives = '|'.join(re.escape(extension) for extension in common_filename_extensions)
    pattern = re.compile(f'(?:{alternatives})$', re.IGNORECASE)

    # Try to split multi-value field on "||" separator and check each value
    for value in field.split('||'):
        if pattern.search(value) is None:
            print(f'Filename with uncommon extension: {value}')

    return field

View File

@ -68,14 +68,17 @@ def separators(field):
def unnecessary_unicode(field):
"""Remove unnecessary Unicode characters.
"""Remove and replace unnecessary Unicode characters.
Removes unnecessary Unicode characters like:
- Zero-width space (U+200B)
- Replacement character (U+FFFD)
- No-break space (U+00A0)
Return string with characters removed.
Replaces unnecessary Unicode characters like:
- Soft hyphen (U+00AD) → hyphen
Return string with characters removed or replaced.
"""
# Skip fields with missing values
@ -106,6 +109,14 @@ def unnecessary_unicode(field):
print(f'Removing unnecessary Unicode (U+00A0): {field}')
field = re.sub(pattern, '', field)
# Check for soft hyphens (U+00AD), sometimes preceded by a normal hyphen
pattern = re.compile(r'\u002D*?\u00AD')
match = re.findall(pattern, field)
if match:
print(f'Replacing unnecessary Unicode (U+00AD): {field}')
field = re.sub(pattern, '-', field)
return field

View File

@ -1 +1 @@
VERSION = '0.2.0'
VERSION = '0.2.1'

View File

@ -1,23 +1,25 @@
dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country
Leading space,2019-07-29,,,,,
Trailing space ,2019-07-29,,,,,
Excessive space,2019-07-29,,,,,
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,
Duplicate||Duplicate,2019-07-29,,,,,
Invalid ISSN,2019-07-29,2321-2302,,,,
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,
Invalid date,2019-07-260,,,,,
Multiple dates,2019-07-26||2019-01-10,,,,,
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,
Unnecessary Unicode,2019-07-29,,,,,
Suspicious character||foreˆt,2019-07-29,,,,,
Invalid ISO 639-2 language,2019-07-29,,,jp,,
Invalid ISO 639-3 language,2019-07-29,,,chi,,
Invalid language,2019-07-29,,,Span,,
Invalid AGROVOC subject,2019-07-29,,,,FOREST,
dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country,filename
Leading space,2019-07-29,,,,,,
Trailing space ,2019-07-29,,,,,,
Excessive space,2019-07-29,,,,,,
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,
Duplicate||Duplicate,2019-07-29,,,,,,
Invalid ISSN,2019-07-29,2321-2302,,,,,
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,
Invalid date,2019-07-260,,,,,,
Multiple dates,2019-07-26||2019-01-10,,,,,,
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,
Unnecessary Unicode,2019-07-29,,,,,,
Suspicious character||foreˆt,2019-07-29,,,,,,
Invalid ISO 639-2 language,2019-07-29,,,jp,,,
Invalid ISO 639-3 language,2019-07-29,,,chi,,,
Invalid language,2019-07-29,,,Span,,,
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,
Newline (LF),2019-07-30,,,,"TANZA
NIA",
Missing date,,,,,,
Invalid country,2019-08-01,,,,,KENYAA
NIA",,
Missing date,,,,,,,
Invalid country,2019-08-01,,,,,KENYAA,
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,

1 dc.contributor.author birthdate dc.identifier.issn dc.identifier.isbn dc.language.iso dc.subject cg.coverage.country filename
2 Leading space 2019-07-29
3 Trailing space 2019-07-29
4 Excessive space 2019-07-29
5 Miscellaenous ||whitespace | issues 2019-07-29
6 Duplicate||Duplicate 2019-07-29
7 Invalid ISSN 2019-07-29 2321-2302
8 Invalid ISBN 2019-07-29 978-0-306-40615-6
9 Multiple valid ISSNs 2019-07-29 0378-5955||0024-9319
10 Multiple valid ISBNs 2019-07-29 99921-58-10-7||978-0-306-40615-7
11 Invalid date 2019-07-260
12 Multiple dates 2019-07-26||2019-01-10
13 Invalid multi-value separator 2019-07-29 0378-5955|0024-9319
14 Unnecessary Unicode​ 2019-07-29
15 Suspicious character||foreˆt 2019-07-29
16 Invalid ISO 639-2 language 2019-07-29 jp
17 Invalid ISO 639-3 language 2019-07-29 chi
18 Invalid language 2019-07-29 Span
19 Invalid AGROVOC subject 2019-07-29 FOREST
20 Newline (LF) 2019-07-30 TANZA NIA
21 Missing date
22 Invalid country 2019-08-01 KENYAA
23 Uncommon filename extension 2019-08-10 file.pdf.lck
24 Unneccesary unicode (U+002D + U+00AD) 2019-08-10 978-­92-­9043-­823-­6
25

View File

@ -13,7 +13,7 @@ install_requires = [
setuptools.setup(
name="csv-metadata-quality",
version="0.2.0",
version="0.2.1",
author="Alan Orth",
author_email="aorth@mjanja.ch",
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",

View File

@ -194,3 +194,24 @@ def test_check_valid_agrovoc():
result = check.agrovoc(value, field_name)
assert result == value
def test_check_uncommon_filename_extension(capsys):
    '''Check that an uncommon filename extension prints a warning.'''

    value = 'file.pdf.lck'

    # The return value is not needed here; the warning is read back from
    # the stdout that pytest captured during the call
    check.filename_extension(value)

    printed = capsys.readouterr().out
    assert printed == f'Filename with uncommon extension: {value}\n'
def test_check_common_filename_extension():
    '''Check that a common filename extension is returned unchanged.'''

    value = 'file.pdf'

    assert check.filename_extension(value) == value