mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-28 00:28:18 +01:00
Compare commits
7 Commits
d5afbad788
...
8bc4cd419c
Author | SHA1 | Date | |
---|---|---|---|
8bc4cd419c | |||
bde38e9ed4 | |||
8db1e36a6d | |||
fbb625be5c | |||
084b970798 | |||
171b35b015 | |||
545bb8cd0c |
@ -12,7 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
- Use SPDX license data from SPDX themselves instead of spdx-license-list
|
- Use SPDX license data from SPDX themselves instead of spdx-license-list
|
||||||
because it is deprecated and outdated
|
because it is deprecated and outdated
|
||||||
- Require Python 3.9+
|
- Require Python 3.9+
|
||||||
- Don't run `fix.separators()` on title fields
|
- Don't run `fix.separators()` on title or abstract fields
|
||||||
|
- Don't run whitespace or newline fixes on abstract fields
|
||||||
|
- Ignore some common non-SPDX licenses
|
||||||
|
- Ignore `__description` suffix in filenames meant for SAFBuilder when checking
|
||||||
|
for uncommon file extensions
|
||||||
|
|
||||||
### Updated
|
### Updated
|
||||||
- Python dependencies
|
- Python dependencies
|
||||||
|
@ -90,12 +90,14 @@ def run(argv):
|
|||||||
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Fix: whitespace
|
|
||||||
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
|
||||||
|
|
||||||
# Fix: newlines
|
|
||||||
if args.unsafe_fixes:
|
if args.unsafe_fixes:
|
||||||
df[column] = df[column].apply(fix.newlines, field_name=column)
|
match = re.match(r"^.*?abstract.*$", column)
|
||||||
|
if match is None:
|
||||||
|
# Fix: whitespace
|
||||||
|
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
||||||
|
|
||||||
|
# Fix: newlines
|
||||||
|
df[column] = df[column].apply(fix.newlines, field_name=column)
|
||||||
|
|
||||||
# Fix: missing space after comma. Only run on author and citation
|
# Fix: missing space after comma. Only run on author and citation
|
||||||
# fields for now, as this problem is mostly an issue in names.
|
# fields for now, as this problem is mostly an issue in names.
|
||||||
@ -122,9 +124,9 @@ def run(argv):
|
|||||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||||
|
|
||||||
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
||||||
# field because sometimes "|" is used to indicate something like a
|
# and abstract fields because "|" is used to indicate something like
|
||||||
# subtitle.
|
# a subtitle.
|
||||||
match = re.match(r"^.*?title.*$", column)
|
match = re.match(r"^.*?(abstract|title).*$", column)
|
||||||
if match is None:
|
if match is None:
|
||||||
df[column] = df[column].apply(fix.separators, field_name=column)
|
df[column] = df[column].apply(fix.separators, field_name=column)
|
||||||
# Run whitespace fix again after fixing invalid separators
|
# Run whitespace fix again after fixing invalid separators
|
||||||
|
@ -286,6 +286,11 @@ def filename_extension(field):
|
|||||||
|
|
||||||
# Iterate over all values
|
# Iterate over all values
|
||||||
for value in values:
|
for value in values:
|
||||||
|
# Strip filename descriptions that are meant for SAFBuilder, for
|
||||||
|
# example: Annual_Report_2020.pdf__description:Report
|
||||||
|
if "__description" in value:
|
||||||
|
value = value.split("__")[0]
|
||||||
|
|
||||||
# Assume filename extension does not match
|
# Assume filename extension does not match
|
||||||
filename_extension_match = False
|
filename_extension_match = False
|
||||||
|
|
||||||
@ -312,8 +317,19 @@ def spdx_license_identifier(field):
|
|||||||
Prints the value if it is invalid.
|
Prints the value if it is invalid.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# List of common non-SPDX licenses to ignore
|
||||||
|
# See: https://ilri.github.io/cgspace-submission-guidelines/dcterms-license/dcterms-license.txt
|
||||||
|
ignore_licenses = {
|
||||||
|
"All rights reserved; no re-use allowed",
|
||||||
|
"All rights reserved; self-archive copy only",
|
||||||
|
"Copyrighted; Non-commercial educational use only",
|
||||||
|
"Copyrighted; Non-commercial use only",
|
||||||
|
"Copyrighted; all rights reserved",
|
||||||
|
"Other",
|
||||||
|
}
|
||||||
|
|
||||||
# Skip fields with missing values
|
# Skip fields with missing values
|
||||||
if pd.isna(field):
|
if pd.isna(field) or field in ignore_licenses:
|
||||||
return
|
return
|
||||||
|
|
||||||
spdx_licenses = load_spdx_licenses()
|
spdx_licenses = load_spdx_licenses()
|
||||||
|
17
data/abstract-check.csv
Normal file
17
data/abstract-check.csv
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
id,dc.title,dcterms.abstract
|
||||||
|
1,Normal item,This is an abstract
|
||||||
|
2,Leading whitespace, This is an abstract
|
||||||
|
3,Trailing whitespace,This is an abstract
|
||||||
|
4,Consecutive whitespace,This is an abstract
|
||||||
|
5,Newline,"This
|
||||||
|
is an abstract"
|
||||||
|
6,Newline with leading whitespace," This
|
||||||
|
is an abstract"
|
||||||
|
7,Newline with trailing whitespace,"This
|
||||||
|
is an abstract "
|
||||||
|
8,Newline with consecutive whitespace,"This
|
||||||
|
is an abstract"
|
||||||
|
9,Multiple newlines,"This
|
||||||
|
is
|
||||||
|
an
|
||||||
|
abstract"
|
|
Loading…
Reference in New Issue
Block a user