From fbb625be5cb76aa88b551df3f024e108dfa5efff Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 7 Feb 2023 17:01:56 +0300 Subject: [PATCH] Ignore common non-SPDX licenses This is meant to catch licenses that are supposed to be SPDX but aren't, not licenses that *aren't* supposed to be SPDX. We have so many free-text license descriptions like "Copyrighted" and "Other" that I'm sick of seeing warnings for them! --- CHANGELOG.md | 1 + csv_metadata_quality/check.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dbef54..e9b9a9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ because it is deprecated and outdated - Require Python 3.9+ - Don't run `fix.separators()` on title fields - Don't run whitespace or newline fixes on abstract fields +- Ignore some common non-SPDX licenses ### Updated - Python dependencies diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 465895d..e188b32 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -312,8 +312,19 @@ def spdx_license_identifier(field): Prints the value if it is invalid. """ + # List of common non-SPDX licenses to ignore + # See: https://ilri.github.io/cgspace-submission-guidelines/dcterms-license/dcterms-license.txt + ignore_licenses = { + "All rights reserved; no re-use allowed", + "All rights reserved; self-archive copy only", + "Copyrighted; Non-commercial educational use only", + "Copyrighted; Non-commercial use only", + "Copyrighted; all rights reserved", + "Other", + } + # Skip fields with missing values - if pd.isna(field): + if pd.isna(field) or field in ignore_licenses: return spdx_licenses = load_spdx_licenses()