mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-28 00:28:18 +01:00
Compare commits
No commits in common. "8bc4cd419c1d6852fcde541673496b885b91225b" and "d5afbad788d0132d3872962e6fdeafb1ff35da16" have entirely different histories.
8bc4cd419c
...
d5afbad788
@ -12,11 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Use SPDX license data from SPDX themselves instead of spdx-license-list
|
||||
because it is deprecated and outdated
|
||||
- Require Python 3.9+
|
||||
- Don't run `fix.separators()` on title or abstract fields
|
||||
- Don't run whitespace or newline fixes on abstract fields
|
||||
- Ignore some common non-SPDX licenses
|
||||
- Ignore `__description` suffix in filenames meant for SAFBuilder when checking
|
||||
for uncommon file extensions
|
||||
- Don't run `fix.separators()` on title fields
|
||||
|
||||
### Updated
|
||||
- Python dependencies
|
||||
|
@ -90,13 +90,11 @@ def run(argv):
|
||||
|
||||
continue
|
||||
|
||||
if args.unsafe_fixes:
|
||||
match = re.match(r"^.*?abstract.*$", column)
|
||||
if match is None:
|
||||
# Fix: whitespace
|
||||
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
||||
|
||||
# Fix: newlines
|
||||
if args.unsafe_fixes:
|
||||
df[column] = df[column].apply(fix.newlines, field_name=column)
|
||||
|
||||
# Fix: missing space after comma. Only run on author and citation
|
||||
@ -124,9 +122,9 @@ def run(argv):
|
||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||
|
||||
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
||||
# and abstract fields because "|" is used to indicate something like
|
||||
# a subtitle.
|
||||
match = re.match(r"^.*?(abstract|title).*$", column)
|
||||
# field because sometimes "|" is used to indicate something like a
|
||||
# subtitle.
|
||||
match = re.match(r"^.*?title.*$", column)
|
||||
if match is None:
|
||||
df[column] = df[column].apply(fix.separators, field_name=column)
|
||||
# Run whitespace fix again after fixing invalid separators
|
||||
|
@ -286,11 +286,6 @@ def filename_extension(field):
|
||||
|
||||
# Iterate over all values
|
||||
for value in values:
|
||||
# Strip filename descriptions that are meant for SAFBuilder, for
|
||||
# example: Annual_Report_2020.pdf__description:Report
|
||||
if "__description" in value:
|
||||
value = value.split("__")[0]
|
||||
|
||||
# Assume filename extension does not match
|
||||
filename_extension_match = False
|
||||
|
||||
@ -317,19 +312,8 @@ def spdx_license_identifier(field):
|
||||
Prints the value if it is invalid.
|
||||
"""
|
||||
|
||||
# List of common non-SPDX licenses to ignore
|
||||
# See: https://ilri.github.io/cgspace-submission-guidelines/dcterms-license/dcterms-license.txt
|
||||
ignore_licenses = {
|
||||
"All rights reserved; no re-use allowed",
|
||||
"All rights reserved; self-archive copy only",
|
||||
"Copyrighted; Non-commercial educational use only",
|
||||
"Copyrighted; Non-commercial use only",
|
||||
"Copyrighted; all rights reserved",
|
||||
"Other",
|
||||
}
|
||||
|
||||
# Skip fields with missing values
|
||||
if pd.isna(field) or field in ignore_licenses:
|
||||
if pd.isna(field):
|
||||
return
|
||||
|
||||
spdx_licenses = load_spdx_licenses()
|
||||
|
@ -1,17 +0,0 @@
|
||||
id,dc.title,dcterms.abstract
|
||||
1,Normal item,This is an abstract
|
||||
2,Leading whitespace, This is an abstract
|
||||
3,Trailing whitespace,This is an abstract
|
||||
4,Consecutive whitespace,This is an abstract
|
||||
5,Newline,"This
|
||||
is an abstract"
|
||||
6,Newline with leading whitespace," This
|
||||
is an abstract"
|
||||
7,Newline with trailing whitespace,"This
|
||||
is an abstract "
|
||||
8,Newline with consecutive whitespace,"This
|
||||
is an abstract"
|
||||
9,Multiple newlines,"This
|
||||
is
|
||||
an
|
||||
abstract"
|
|
Loading…
Reference in New Issue
Block a user