2024-11-28 08:38:18 +01:00
4 changed files with 10 additions and 49 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,11 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Use SPDX license data from SPDX themselves instead of spdx-license-list
 because it is deprecated and outdated
 - Require Python 3.9+
- Don't run `fix.separators()` on title or abstract fields
+- Don't run `fix.separators()` on title fields
 - Don't run whitespace or newline fixes on abstract fields
 - Ignore some common non-SPDX licenses
 - Ignore `__description` suffix in filenames meant for SAFBuilder when checking
 for uncommon file extensions
 ### Updated
 - Python dependencies
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -90,14 +90,12 @@ def run(argv):
            continue
-        if args.unsafe_fixes:
+        # Fix: whitespace
-            match = re.match(r"^.*?abstract.*$", column)
+        df[column] = df[column].apply(fix.whitespace, field_name=column)
            if match is None:
                # Fix: whitespace
                df[column] = df[column].apply(fix.whitespace, field_name=column)
-                # Fix: newlines
+        # Fix: newlines
-                df[column] = df[column].apply(fix.newlines, field_name=column)
+        if args.unsafe_fixes:
            df[column] = df[column].apply(fix.newlines, field_name=column)
        # Fix: missing space after comma. Only run on author and citation
        # fields for now, as this problem is mostly an issue in names.
@@ -124,9 +122,9 @@ def run(argv):
        df[column] = df[column].apply(fix.unnecessary_unicode)
        # Fix: invalid and unnecessary multi-value separators. Skip the title
-        # and abstract fields because "|" is used to indicate something like
+        # field because sometimes "|" is used to indicate something like a
-        # a subtitle.
+        # subtitle.
-        match = re.match(r"^.*?(abstract|title).*$", column)
+        match = re.match(r"^.*?title.*$", column)
        if match is None:
            df[column] = df[column].apply(fix.separators, field_name=column)
            # Run whitespace fix again after fixing invalid separators
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -286,11 +286,6 @@ def filename_extension(field):
    # Iterate over all values
    for value in values:
        # Strip filename descriptions that are meant for SAF Bundler, for
        # example: Annual_Report_2020.pdf__description:Report
        if "__description" in value:
            value = value.split("__")[0]
        # Assume filename extension does not match
        filename_extension_match = False
@@ -317,19 +312,8 @@ def spdx_license_identifier(field):
    Prints the value if it is invalid.
    """
    # List of common non-SPDX licenses to ignore
    # See: https://ilri.github.io/cgspace-submission-guidelines/dcterms-license/dcterms-license.txt
    ignore_licenses = {
        "All rights reserved; no re-use allowed",
        "All rights reserved; self-archive copy only",
        "Copyrighted; Non-commercial educational use only",
        "Copyrighted; Non-commercial use only",
        "Copyrighted; all rights reserved",
        "Other",
    }
    # Skip fields with missing values
-    if pd.isna(field) or field in ignore_licenses:
+    if pd.isna(field):
        return
    spdx_licenses = load_spdx_licenses()
--- a/data/abstract-check.csv
+++ b/data/abstract-check.csv
@@ -1,17 +0,0 @@
 id,dc.title,dcterms.abstract
 1,Normal item,This is an abstract
 2,Leading whitespace, This is an abstract
 3,Trailing whitespace,This is an abstract 
 4,Consecutive whitespace,This is  an abstract
 5,Newline,"This
 is an abstract"
 6,Newline with leading whitespace," This
 is an abstract"
 7,Newline with trailing whitespace,"This 
 is an abstract "
 8,Newline with consecutive whitespace,"This
 is an  abstract"
 9,Multiple newlines,"This
 is
 an
 abstract"