Version 0.2.1

CHANGELOG.md: Move unreleased changes to 0.2.1
CHANGELOG.md: Add note about replacement of unnecessary Unicode
2025-05-16 01:33:00 +02:00 · 2019-08-11 10:39:39 +03:00 · 2019-08-11 10:39:18 +03:00 · 2019-08-11 00:09:35 +03:00 · 2019-08-11 00:08:44 +03:00 · 2019-08-11 00:07:21 +03:00
8 changed files with 114 additions and 26 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ## [0.2.1] - 2019-08-11
 ### Added
 - Check for uncommon filename extensions 
 - Replacement of unnecessary Unicode characters like soft hyphens (U+00AD)
 ## [0.2.0] - 2019-08-09
 ### Added
 - Handle Ctrl-C interrupt gracefully
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@ -86,6 +86,10 @@ def run(argv):
        if match is not None:
            df[column] = df[column].apply(check.date)
        # Check: filename extension
        if column == 'filename':
            df[column] = df[column].apply(check.filename_extension)
    # Write
    df.to_csv(args.output_file, index=False)
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@ -253,3 +253,48 @@ def agrovoc(field, field_name):
                    print(f'Invalid AGROVOC ({field_name}): {value}')
    return field
 def filename_extension(field):
    """Warn about filenames with uncommon extensions.

    CSVs with a 'filename' column are likely meant as input for the SAFBuilder
    tool, which creates a Simple Archive Format bundle for importing metadata
    with accompanying PDFs or other files into DSpace.

    Each "||"-separated value in the field is checked, and a warning is printed
    for every value that does not end in one of the common extensions (.pdf,
    .doc(x), .ppt(x), .xls(x), case insensitive).

    Return the field unchanged, including missing values.
    """
    import re
    # Missing values have nothing to check; hand them back untouched so that
    # applying this function over a column does not rewrite empty cells.
    if pd.isna(field):
        return field
    # List of common filename extensions
    common_filename_extensions = ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx']
    # Build one case-insensitive pattern anchored at the end of the filename,
    # instead of compiling a separate pattern per extension for every value
    alternatives = '|'.join(re.escape(extension) for extension in common_filename_extensions)
    pattern = re.compile(f'(?:{alternatives})$', re.IGNORECASE)
    # Multi-value fields use "||" as the separator; check each value on its own
    for value in field.split('||'):
        if pattern.search(value) is None:
            print(f'Filename with uncommon extension: {value}')
    return field
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@ -68,14 +68,17 @@ def separators(field):
 def unnecessary_unicode(field):
-    """Remove unnecessary Unicode characters.
+    """Remove and replace unnecessary Unicode characters.
    Removes unnecessary Unicode characters like:
        - Zero-width space (U+200B)
        - Replacement character (U+FFFD)
        - No-break space (U+00A0)
-    Return string with characters removed.
+    Replaces unnecessary Unicode characters like:
        - Soft hyphen (U+00AD) → hyphen
    Return string with characters removed or replaced.
    """
    # Skip fields with missing values
@ -106,6 +109,14 @@ def unnecessary_unicode(field):
        print(f'Removing unnecessary Unicode (U+00A0): {field}')
        field = re.sub(pattern, '', field)
    # Check for soft hyphens (U+00AD), sometimes preceded by a normal hyphen
    pattern = re.compile(r'\u002D*?\u00AD')
    match = re.findall(pattern, field)
    if match:
        print(f'Replacing unnecessary Unicode (U+00AD): {field}')
        field = re.sub(pattern, '-', field)
    return field
--- a/csv_metadata_quality/version.py
+++ b/csv_metadata_quality/version.py
@ -1 +1 @@
-VERSION = '0.2.0'
+VERSION = '0.2.1'
--- a/data/test.csv
+++ b/data/test.csv
@ -1,23 +1,25 @@
-dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country
+dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country,filename
- Leading space,2019-07-29,,,,,
+ Leading space,2019-07-29,,,,,,
-Trailing space ,2019-07-29,,,,,
+Trailing space ,2019-07-29,,,,,,
-Excessive  space,2019-07-29,,,,,
+Excessive  space,2019-07-29,,,,,,
-Miscellaenous ||whitespace | issues ,2019-07-29,,,,,
+Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,
-Duplicate||Duplicate,2019-07-29,,,,,
+Duplicate||Duplicate,2019-07-29,,,,,,
-Invalid ISSN,2019-07-29,2321-2302,,,,
+Invalid ISSN,2019-07-29,2321-2302,,,,,
-Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,
+Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,
-Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,
+Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,
-Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,
+Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,
-Invalid date,2019-07-260,,,,,
+Invalid date,2019-07-260,,,,,,
-Multiple dates,2019-07-26||2019-01-10,,,,,
+Multiple dates,2019-07-26||2019-01-10,,,,,,
-Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,
+Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,
-Unnecessary Unicode,2019-07-29,,,,,
+Unnecessary Unicode,2019-07-29,,,,,,
-Suspicious character||foreˆt,2019-07-29,,,,,
+Suspicious character||foreˆt,2019-07-29,,,,,,
-Invalid ISO 639-2 language,2019-07-29,,,jp,,
+Invalid ISO 639-2 language,2019-07-29,,,jp,,,
-Invalid ISO 639-3 language,2019-07-29,,,chi,,
+Invalid ISO 639-3 language,2019-07-29,,,chi,,,
-Invalid language,2019-07-29,,,Span,,
+Invalid language,2019-07-29,,,Span,,,
-Invalid AGROVOC subject,2019-07-29,,,,FOREST,
+Invalid AGROVOC subject,2019-07-29,,,,FOREST,,
 Newline (LF),2019-07-30,,,,"TANZA
-NIA",
+NIA",,
-Missing date,,,,,,
+Missing date,,,,,,,
-Invalid country,2019-08-01,,,,,KENYAA
+Invalid country,2019-08-01,,,,,KENYAA,
 Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck
 Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,
--- a/setup.py
+++ b/setup.py
@ -13,7 +13,7 @@ install_requires = [
 setuptools.setup(
    name="csv-metadata-quality",
-    version="0.2.0",
+    version="0.2.1",
    author="Alan Orth",
    author_email="aorth@mjanja.ch",
    description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
--- a/tests/test_check.py
+++ b/tests/test_check.py
@ -194,3 +194,24 @@ def test_check_valid_agrovoc():
    result = check.agrovoc(value, field_name)
    assert result == value
 def test_check_uncommon_filename_extension(capsys):
    '''A filename ending in an uncommon extension is reported on stdout.'''
    filename = 'file.pdf.lck'
    expected_warning = f'Filename with uncommon extension: {filename}\n'
    # Only the printed warning matters here, so the return value is ignored
    check.filename_extension(filename)
    assert capsys.readouterr().out == expected_warning
 def test_check_common_filename_extension():
    '''A filename with a common extension is returned unchanged.'''
    filename = 'file.pdf'
    assert check.filename_extension(filename) == filename
Author	SHA1	Message	Date
Alan Orth	7255bf4707	Version 0.2.1	2019-08-11 10:39:39 +03:00
Alan Orth	3aaf18c290	CHANGELOG.md: Move unreleased changes to 0.2.1	2019-08-11 10:39:18 +03:00
Alan Orth	745306edd7	CHANGELOG.md: Add note about replacement of unnecessary Unicode	2019-08-11 00:09:35 +03:00
Alan Orth	e324e321a2	data/test.csv: Add test for replacement of unnecessary Unicode	2019-08-11 00:08:44 +03:00
Alan Orth	232ff99898	csv_metadata_quality/fix.py: Add more unnecessary Unicode fixes Add a check for soft hyphens (U+00AD). In one sample CSV I have a normal hyphen followed by a soft hyphen in an ISBN. This causes the ISBN validation to fail.	2019-08-11 00:07:21 +03:00
Alan Orth	13d5221378	csv_metadata_quality/check.py: Fix test for False	2019-08-10 23:52:53 +03:00
Alan Orth	3c7a9eb75b	CHANGELOG.md: Add check for uncommon filename extensions	2019-08-10 23:47:46 +03:00
Alan Orth	a99fbd8a51	data/test.csv: Add test case for uncommon filename extension	2019-08-10 23:46:56 +03:00
Alan Orth	e801042340	tests/test_check.py: Fix unused result We don't need to capture the function's return value here because pytest will capture stdout from the function.	2019-08-10 23:45:41 +03:00
Alan Orth	62ef2a4489	tests/test_check.py: Add tests for file extensions	2019-08-10 23:44:13 +03:00
Alan Orth	9ce7dc6716	Add check for uncommon filenames Generally we want people to upload documents in accessible formats like PDF, Word, Excel, and PowerPoint. This check warns if a file is using an uncommon extension.	2019-08-10 23:41:16 +03:00