1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-28 00:28:18 +01:00

Compare commits

..

7 Commits

Author SHA1 Message Date
8bc4cd419c
Strip filename descriptions before checking
Some checks failed
continuous-integration/drone/push Build is failing
When checking for uncommon file extensions in the filename field,
we should strip descriptions that are meant for SAFBuilder, for
example: Annual_Report_2020.pdf__description:Report. Otherwise the
description ends up as a false positive that spams the output with warnings.
2023-02-13 11:00:57 +03:00
bde38e9ed4
CHANGELOG.md: add notes about abstracts 2023-02-13 10:39:03 +03:00
8db1e36a6d
csv_metadata_quality/app.py: skip abstract in separator check
Also skip the abstract in the separator check. It's rare to have
any "|" there, and if one is present then it is most likely there
for a reason.
2023-02-13 10:37:33 +03:00
fbb625be5c
Ignore common non-SPDX licenses
This is meant to catch licenses that are supposed to be SPDX but
aren't, not licenses that *aren't* supposed to be SPDX. We have so
many free-text license descriptions like "Copyrighted" and "Other"
that I'm sick of seeing warnings for them!
2023-02-07 17:01:56 +03:00
084b970798
CHANGELOG.md: add note about abstract field 2023-02-07 16:52:34 +03:00
171b35b015
Add data/abstract-check.csv
A test file with several whitespace and newline scenarios in the
abstract. I am currently disabling whitespace/newline fixes in the
abstract because they are too aggressive.
2023-02-07 16:50:47 +03:00
545bb8cd0c
csv_metadata_quality/app.py: disable whitespace on abstracts
It's too aggressive on abstracts. If people paste in text from a
PDF there are often newlines, and most of the time this is what
they want.
2023-02-07 16:48:40 +03:00
4 changed files with 49 additions and 10 deletions

View File

@ -12,7 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Use SPDX license data from SPDX themselves instead of spdx-license-list - Use SPDX license data from SPDX themselves instead of spdx-license-list
because it is deprecated and outdated because it is deprecated and outdated
- Require Python 3.9+ - Require Python 3.9+
- Don't run `fix.separators()` on title fields - Don't run `fix.separators()` on title or abstract fields
- Don't run whitespace or newline fixes on abstract fields
- Ignore some common non-SPDX licenses
- Ignore `__description` suffix in filenames meant for SAFBuilder when checking
for uncommon file extensions
### Updated ### Updated
- Python dependencies - Python dependencies

View File

@ -90,12 +90,14 @@ def run(argv):
continue continue
# Fix: whitespace
df[column] = df[column].apply(fix.whitespace, field_name=column)
# Fix: newlines
if args.unsafe_fixes: if args.unsafe_fixes:
df[column] = df[column].apply(fix.newlines, field_name=column) match = re.match(r"^.*?abstract.*$", column)
if match is None:
# Fix: whitespace
df[column] = df[column].apply(fix.whitespace, field_name=column)
# Fix: newlines
df[column] = df[column].apply(fix.newlines, field_name=column)
# Fix: missing space after comma. Only run on author and citation # Fix: missing space after comma. Only run on author and citation
# fields for now, as this problem is mostly an issue in names. # fields for now, as this problem is mostly an issue in names.
@ -122,9 +124,9 @@ def run(argv):
df[column] = df[column].apply(fix.unnecessary_unicode) df[column] = df[column].apply(fix.unnecessary_unicode)
# Fix: invalid and unnecessary multi-value separators. Skip the title # Fix: invalid and unnecessary multi-value separators. Skip the title
# field because sometimes "|" is used to indicate something like a # and abstract fields because "|" is used to indicate something like
# subtitle. # a subtitle.
match = re.match(r"^.*?title.*$", column) match = re.match(r"^.*?(abstract|title).*$", column)
if match is None: if match is None:
df[column] = df[column].apply(fix.separators, field_name=column) df[column] = df[column].apply(fix.separators, field_name=column)
# Run whitespace fix again after fixing invalid separators # Run whitespace fix again after fixing invalid separators

View File

@ -286,6 +286,11 @@ def filename_extension(field):
# Iterate over all values # Iterate over all values
for value in values: for value in values:
# Strip filename descriptions that are meant for SAF Bundler, for
# example: Annual_Report_2020.pdf__description:Report
if "__description" in value:
value = value.split("__")[0]
# Assume filename extension does not match # Assume filename extension does not match
filename_extension_match = False filename_extension_match = False
@ -312,8 +317,19 @@ def spdx_license_identifier(field):
Prints the value if it is invalid. Prints the value if it is invalid.
""" """
# List of common non-SPDX licenses to ignore
# See: https://ilri.github.io/cgspace-submission-guidelines/dcterms-license/dcterms-license.txt
ignore_licenses = {
"All rights reserved; no re-use allowed",
"All rights reserved; self-archive copy only",
"Copyrighted; Non-commercial educational use only",
"Copyrighted; Non-commercial use only",
"Copyrighted; all rights reserved",
"Other",
}
# Skip fields with missing values # Skip fields with missing values
if pd.isna(field): if pd.isna(field) or field in ignore_licenses:
return return
spdx_licenses = load_spdx_licenses() spdx_licenses = load_spdx_licenses()

17
data/abstract-check.csv Normal file
View File

@ -0,0 +1,17 @@
id,dc.title,dcterms.abstract
1,Normal item,This is an abstract
2,Leading whitespace, This is an abstract
3,Trailing whitespace,This is an abstract
4,Consecutive whitespace,This is an abstract
5,Newline,"This
is an abstract"
6,Newline with leading whitespace," This
is an abstract"
7,Newline with trailing whitespace,"This
is an abstract "
8,Newline with consecutive whitespace,"This
is an abstract"
9,Multiple newlines,"This
is
an
abstract"
1 id dc.title dcterms.abstract
2 1 Normal item This is an abstract
3 2 Leading whitespace This is an abstract
4 3 Trailing whitespace This is an abstract
5 4 Consecutive whitespace This is an abstract
6 5 Newline This is an abstract
7 6 Newline with leading whitespace This is an abstract
8 7 Newline with trailing whitespace This is an abstract
9 8 Newline with consecutive whitespace This is an abstract
10 9 Multiple newlines This is an abstract