mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-28 08:38:18 +01:00
Compare commits
No commits in common. "8bc4cd419c1d6852fcde541673496b885b91225b" and "d5afbad788d0132d3872962e6fdeafb1ff35da16" have entirely different histories.
8bc4cd419c
...
d5afbad788
@ -12,11 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
- Use SPDX license data from SPDX themselves instead of spdx-license-list
|
- Use SPDX license data from SPDX themselves instead of spdx-license-list
|
||||||
because it is deprecated and outdated
|
because it is deprecated and outdated
|
||||||
- Require Python 3.9+
|
- Require Python 3.9+
|
||||||
- Don't run `fix.separators()` on title or abstract fields
|
- Don't run `fix.separators()` on title fields
|
||||||
- Don't run whitespace or newline fixes on abstract fields
|
|
||||||
- Ignore some common non-SPDX licenses
|
|
||||||
- Ignore `__description` suffix in filenames meant for SAFBuilder when checking
|
|
||||||
for uncommon file extensions
|
|
||||||
|
|
||||||
### Updated
|
### Updated
|
||||||
- Python dependencies
|
- Python dependencies
|
||||||
|
@ -90,14 +90,12 @@ def run(argv):
|
|||||||
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if args.unsafe_fixes:
|
# Fix: whitespace
|
||||||
match = re.match(r"^.*?abstract.*$", column)
|
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
||||||
if match is None:
|
|
||||||
# Fix: whitespace
|
|
||||||
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
|
||||||
|
|
||||||
# Fix: newlines
|
# Fix: newlines
|
||||||
df[column] = df[column].apply(fix.newlines, field_name=column)
|
if args.unsafe_fixes:
|
||||||
|
df[column] = df[column].apply(fix.newlines, field_name=column)
|
||||||
|
|
||||||
# Fix: missing space after comma. Only run on author and citation
|
# Fix: missing space after comma. Only run on author and citation
|
||||||
# fields for now, as this problem is mostly an issue in names.
|
# fields for now, as this problem is mostly an issue in names.
|
||||||
@ -124,9 +122,9 @@ def run(argv):
|
|||||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||||
|
|
||||||
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
||||||
# and abstract fields because "|" is used to indicate something like
|
# field because sometimes "|" is used to indicate something like a
|
||||||
# a subtitle.
|
# subtitle.
|
||||||
match = re.match(r"^.*?(abstract|title).*$", column)
|
match = re.match(r"^.*?title.*$", column)
|
||||||
if match is None:
|
if match is None:
|
||||||
df[column] = df[column].apply(fix.separators, field_name=column)
|
df[column] = df[column].apply(fix.separators, field_name=column)
|
||||||
# Run whitespace fix again after fixing invalid separators
|
# Run whitespace fix again after fixing invalid separators
|
||||||
|
@ -286,11 +286,6 @@ def filename_extension(field):
|
|||||||
|
|
||||||
# Iterate over all values
|
# Iterate over all values
|
||||||
for value in values:
|
for value in values:
|
||||||
# Strip filename descriptions that are meant for SAFBuilder, for
|
|
||||||
# example: Annual_Report_2020.pdf__description:Report
|
|
||||||
if "__description" in value:
|
|
||||||
value = value.split("__")[0]
|
|
||||||
|
|
||||||
# Assume filename extension does not match
|
# Assume filename extension does not match
|
||||||
filename_extension_match = False
|
filename_extension_match = False
|
||||||
|
|
||||||
@ -317,19 +312,8 @@ def spdx_license_identifier(field):
|
|||||||
Prints the value if it is invalid.
|
Prints the value if it is invalid.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# List of common non-SPDX licenses to ignore
|
|
||||||
# See: https://ilri.github.io/cgspace-submission-guidelines/dcterms-license/dcterms-license.txt
|
|
||||||
ignore_licenses = {
|
|
||||||
"All rights reserved; no re-use allowed",
|
|
||||||
"All rights reserved; self-archive copy only",
|
|
||||||
"Copyrighted; Non-commercial educational use only",
|
|
||||||
"Copyrighted; Non-commercial use only",
|
|
||||||
"Copyrighted; all rights reserved",
|
|
||||||
"Other",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Skip fields with missing values
|
# Skip fields with missing values
|
||||||
if pd.isna(field) or field in ignore_licenses:
|
if pd.isna(field):
|
||||||
return
|
return
|
||||||
|
|
||||||
spdx_licenses = load_spdx_licenses()
|
spdx_licenses = load_spdx_licenses()
|
||||||
|
@ -1,17 +0,0 @@
|
|||||||
id,dc.title,dcterms.abstract
|
|
||||||
1,Normal item,This is an abstract
|
|
||||||
2,Leading whitespace, This is an abstract
|
|
||||||
3,Trailing whitespace,This is an abstract
|
|
||||||
4,Consecutive whitespace,This is an abstract
|
|
||||||
5,Newline,"This
|
|
||||||
is an abstract"
|
|
||||||
6,Newline with leading whitespace," This
|
|
||||||
is an abstract"
|
|
||||||
7,Newline with trailing whitespace,"This
|
|
||||||
is an abstract "
|
|
||||||
8,Newline with consecutive whitespace,"This
|
|
||||||
is an abstract"
|
|
||||||
9,Multiple newlines,"This
|
|
||||||
is
|
|
||||||
an
|
|
||||||
abstract"
|
|
|
Loading…
Reference in New Issue
Block a user