2024-11-28 00:28:18 +01:00
4 changed files with 10 additions and 49 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,11 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Use SPDX license data from SPDX themselves instead of spdx-license-list
 because it is deprecated and outdated
 - Require Python 3.9+
-- Don't run `fix.separators()` on title or abstract fields
-- Don't run whitespace or newline fixes on abstract fields
-- Ignore some common non-SPDX licenses
-- Ignore `__description` suffix in filenames meant for SAFBuilder when checking
-for uncommon file extensions
+- Don't run `fix.separators()` on title fields

 ### Updated
 - Python dependencies
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -90,13 +90,11 @@ def run(argv):

            continue

-        if args.unsafe_fixes:
-            match = re.match(r"^.*?abstract.*$", column)
-            if match is None:
        # Fix: whitespace
        df[column] = df[column].apply(fix.whitespace, field_name=column)

        # Fix: newlines
+        if args.unsafe_fixes:
            df[column] = df[column].apply(fix.newlines, field_name=column)

        # Fix: missing space after comma. Only run on author and citation
@@ -124,9 +122,9 @@ def run(argv):
        df[column] = df[column].apply(fix.unnecessary_unicode)

        # Fix: invalid and unnecessary multi-value separators. Skip the title
-        # and abstract fields because "|" is used to indicate something like
-        # a subtitle.
-        match = re.match(r"^.*?(abstract|title).*$", column)
+        # field because sometimes "|" is used to indicate something like a
+        # subtitle.
+        match = re.match(r"^.*?title.*$", column)
        if match is None:
            df[column] = df[column].apply(fix.separators, field_name=column)
            # Run whitespace fix again after fixing invalid separators
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -286,11 +286,6 @@ def filename_extension(field):

    # Iterate over all values
    for value in values:
-        # Strip filename descriptions that are meant for SAF Bundler, for
-        # example: Annual_Report_2020.pdf__description:Report
-        if "__description" in value:
-            value = value.split("__")[0]
-
        # Assume filename extension does not match
        filename_extension_match = False

@@ -317,19 +312,8 @@ def spdx_license_identifier(field):
    Prints the value if it is invalid.
    """

-    # List of common non-SPDX licenses to ignore
-    # See: https://ilri.github.io/cgspace-submission-guidelines/dcterms-license/dcterms-license.txt
-    ignore_licenses = {
-        "All rights reserved; no re-use allowed",
-        "All rights reserved; self-archive copy only",
-        "Copyrighted; Non-commercial educational use only",
-        "Copyrighted; Non-commercial use only",
-        "Copyrighted; all rights reserved",
-        "Other",
-    }
-
    # Skip fields with missing values
-    if pd.isna(field) or field in ignore_licenses:
+    if pd.isna(field):
        return

    spdx_licenses = load_spdx_licenses()
--- a/data/abstract-check.csv
+++ b/data/abstract-check.csv
@@ -1,17 +0,0 @@
-id,dc.title,dcterms.abstract
-1,Normal item,This is an abstract
-2,Leading whitespace, This is an abstract
-3,Trailing whitespace,This is an abstract 
-4,Consecutive whitespace,This is  an abstract
-5,Newline,"This
-is an abstract"
-6,Newline with leading whitespace," This
- is an abstract"
-7,Newline with trailing whitespace,"This 
-is an abstract "
-8,Newline with consecutive whitespace,"This
-is an  abstract"
-9,Multiple newlines,"This
-is
-an
-abstract"