1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-28 08:38:18 +01:00

Compare commits

..

No commits in common. "8bc4cd419c1d6852fcde541673496b885b91225b" and "d5afbad788d0132d3872962e6fdeafb1ff35da16" have entirely different histories.

4 changed files with 10 additions and 49 deletions

View File

@ -12,11 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Use SPDX license data from SPDX themselves instead of spdx-license-list - Use SPDX license data from SPDX themselves instead of spdx-license-list
because it is deprecated and outdated because it is deprecated and outdated
- Require Python 3.9+ - Require Python 3.9+
- Don't run `fix.separators()` on title or abstract fields - Don't run `fix.separators()` on title fields
- Don't run whitespace or newline fixes on abstract fields
- Ignore some common non-SPDX licenses
- Ignore `__description` suffix in filenames meant for SAFBuilder when checking
for uncommon file extensions
### Updated ### Updated
- Python dependencies - Python dependencies

View File

@ -90,14 +90,12 @@ def run(argv):
continue continue
if args.unsafe_fixes: # Fix: whitespace
match = re.match(r"^.*?abstract.*$", column) df[column] = df[column].apply(fix.whitespace, field_name=column)
if match is None:
# Fix: whitespace
df[column] = df[column].apply(fix.whitespace, field_name=column)
# Fix: newlines # Fix: newlines
df[column] = df[column].apply(fix.newlines, field_name=column) if args.unsafe_fixes:
df[column] = df[column].apply(fix.newlines, field_name=column)
# Fix: missing space after comma. Only run on author and citation # Fix: missing space after comma. Only run on author and citation
# fields for now, as this problem is mostly an issue in names. # fields for now, as this problem is mostly an issue in names.
@ -124,9 +122,9 @@ def run(argv):
df[column] = df[column].apply(fix.unnecessary_unicode) df[column] = df[column].apply(fix.unnecessary_unicode)
# Fix: invalid and unnecessary multi-value separators. Skip the title # Fix: invalid and unnecessary multi-value separators. Skip the title
# and abstract fields because "|" is used to indicate something like # field because sometimes "|" is used to indicate something like a
# a subtitle. # subtitle.
match = re.match(r"^.*?(abstract|title).*$", column) match = re.match(r"^.*?title.*$", column)
if match is None: if match is None:
df[column] = df[column].apply(fix.separators, field_name=column) df[column] = df[column].apply(fix.separators, field_name=column)
# Run whitespace fix again after fixing invalid separators # Run whitespace fix again after fixing invalid separators

View File

@ -286,11 +286,6 @@ def filename_extension(field):
# Iterate over all values # Iterate over all values
for value in values: for value in values:
# Strip filename descriptions that are meant for SAFBuilder, for
# example: Annual_Report_2020.pdf__description:Report
if "__description" in value:
value = value.split("__")[0]
# Assume filename extension does not match # Assume filename extension does not match
filename_extension_match = False filename_extension_match = False
@ -317,19 +312,8 @@ def spdx_license_identifier(field):
Prints the value if it is invalid. Prints the value if it is invalid.
""" """
# List of common non-SPDX licenses to ignore
# See: https://ilri.github.io/cgspace-submission-guidelines/dcterms-license/dcterms-license.txt
ignore_licenses = {
"All rights reserved; no re-use allowed",
"All rights reserved; self-archive copy only",
"Copyrighted; Non-commercial educational use only",
"Copyrighted; Non-commercial use only",
"Copyrighted; all rights reserved",
"Other",
}
# Skip fields with missing values # Skip fields with missing values
if pd.isna(field) or field in ignore_licenses: if pd.isna(field):
return return
spdx_licenses = load_spdx_licenses() spdx_licenses = load_spdx_licenses()

View File

@ -1,17 +0,0 @@
id,dc.title,dcterms.abstract
1,Normal item,This is an abstract
2,Leading whitespace, This is an abstract
3,Trailing whitespace,This is an abstract
4,Consecutive whitespace,This is an abstract
5,Newline,"This
is an abstract"
6,Newline with leading whitespace," This
is an abstract"
7,Newline with trailing whitespace,"This
is an abstract "
8,Newline with consecutive whitespace,"This
is an abstract"
9,Multiple newlines,"This
is
an
abstract"
1 id dc.title dcterms.abstract
2 1 Normal item This is an abstract
3 2 Leading whitespace This is an abstract
4 3 Trailing whitespace This is an abstract
5 4 Consecutive whitespace This is an abstract
6 5 Newline This is an abstract
7 6 Newline with leading whitespace This is an abstract
8 7 Newline with trailing whitespace This is an abstract
9 8 Newline with consecutive whitespace This is an abstract
10 9 Multiple newlines This is an abstract