Mirror of https://github.com/ilri/csv-metadata-quality.git (synced 2025-05-10 07:06:00 +02:00)

Compare commits (12 commits)
Commits (SHA1):

- 36d0474b95
- efdc3a841a
- fd2ba6845d
- e55380b4d5
- 85ae16d9b7
- c42f8b4812
- 1c75608d54
- 0b15a8ed3b
- 9ca266f5f0
- 0d3f948708
- c04207fcfc
- 9d4eceddc7
@@ -13,7 +13,7 @@ tasks:
 - testcli: |
     cd csv-metadata-quality
     pipenv run pip install .
-    pipenv run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u --agrovoc-fields dc.subject,cg.coverage.country
+    pipenv run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e -u --agrovoc-fields dc.subject,cg.coverage.country
 environment:
   PIPENV_NOSPIN: 'True'
   PIPENV_HIDE_EMOJIS: 'True'
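The added `-e` flag turns on the experimental language validation that 0.3.0 introduced via the Python `langid` library (see the CHANGELOG and README hunks below). A rough, hedged sketch of the underlying idea; the function and field handling here are illustrative and not the project's actual check code:

```python
# Hedged sketch of the idea behind the experimental "-e" check, assuming the
# "langid" package is installed. Field names and the comparison are
# illustrative; this is not the project's actual check code.
import langid

def check_language(title, declared_iso):
    # langid.classify() returns a (language, score) tuple, e.g. ("en", -54.4)
    detected, score = langid.classify(title)
    if detected != declared_iso.lower():
        print(f"Possibly incorrect language {declared_iso} (detected {detected}): {title}")

check_language("Seguridad alimentaria y cambio climático", "en")
```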
@@ -4,14 +4,19 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.3.1] - 2019-10-01
+## Changed
+- Replace non-breaking spaces (U+00A0) with space instead of removing them
+- Harmonize language of script output when fixing various issues
+
 ## [0.3.0] - 2019-09-26
 ### Updated
 - Update python dependencies to latest versions, including numpy 1.17.2, pandas
   0.25.1, pytest 5.1.3, and requests-cache 0.5.2

-## Added
+### Added
 - csvkit to dev requirements (csvcut etc are useful during development)
-- Experimental language validation using `-e` (see README.md)
+- Experimental language validation using the Python `langid` library (enable with `-e`, see README.md)

 ### Changed
 - Re-formatted code with black and isort
@@ -1,5 +1,5 @@
 # CSV Metadata Quality [](https://travis-ci.org/ilri/csv-metadata-quality) [](https://builds.sr.ht/~alanorth/csv-metadata-quality?)
-A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem. The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, etc.
+A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem (though it could theoretically work on any CSV that uses Dublin Core fields as columns). The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, etc.

 Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.

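The description above summarizes the pipeline: multi-value cells are split on the DSpace "||" separator, each value is cleaned (whitespace trimmed, and so on), and the values are joined back together. A minimal sketch of that general pattern, not the project's code:

```python
# Simplified illustration of the split -> clean -> rejoin pattern the README
# describes; the real tool applies many more checks and fixes per value.
def clean_field(field):
    values = field.split("||")                      # split on the DSpace multi-value separator
    cleaned = [value.strip() for value in values]   # trim leading/trailing whitespace
    return "||".join(cleaned)                       # rejoin with the same separator

print(clean_field(" Kenya || Ethiopia "))  # -> "Kenya||Ethiopia"
```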
@@ -92,7 +92,6 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
 - Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
 - Warn if two items use the same file in `filename` column
 - Add an option to drop invalid AGROVOC subjects?
-- Add check for author names with incorrect spacing after commas, ie "Orth,Alan S."
 - Add tests for application invocation, ie `tests/test_app.py`?

 ## License
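The author-name spacing check mentioned in this hunk ("Orth,Alan S.") could be approximated with a simple regular expression; a purely hypothetical sketch, not code from this repository:

```python
# Hypothetical sketch of the author-spacing idea above: flag author names with
# a missing space after a comma, e.g. "Orth,Alan S.". Not code from this repo.
import re

def check_comma_space(author):
    if re.search(r",\S", author):
        print(f"Missing space after comma: {author}")

check_comma_space("Orth,Alan S.")   # flagged
check_comma_space("Orth, Alan S.")  # passes silently
```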
@@ -26,7 +26,7 @@ def whitespace(field):
         match = re.findall(pattern, value)

         if match:
-            print(f"Excessive whitespace: {value}")
+            print(f"Removing excessive whitespace: {value}")
             value = re.sub(pattern, " ", value)

         # Save cleaned value
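This hunk only rewords the log message; for context, the fix collapses runs of excessive whitespace to a single space. A self-contained sketch of that behaviour, assuming a pattern like `\s{2,}` (the real pattern is defined outside the displayed lines):

```python
# Standalone sketch of the excessive-whitespace fix: collapse runs of two or
# more whitespace characters into a single space. The pattern definition sits
# outside the displayed hunk, so r"\s{2,}" is an assumption here.
import re

def fix_whitespace(value):
    pattern = re.compile(r"\s{2,}")
    if re.findall(pattern, value):
        print(f"Removing excessive whitespace: {value}")
        value = re.sub(pattern, " ", value)
    return value

print(fix_whitespace("Excessive   space"))  # -> "Excessive space"
```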
@@ -74,10 +74,10 @@ def unnecessary_unicode(field):
     Removes unnecessary Unicode characters like:
         - Zero-width space (U+200B)
         - Replacement character (U+FFFD)
-        - No-break space (U+00A0)

     Replaces unnecessary Unicode characters like:
         - Soft hyphen (U+00AD) → hyphen
+        - No-break space (U+00A0) → space

     Return string with characters removed or replaced.
     """
@@ -107,8 +107,8 @@ def unnecessary_unicode(field):
     match = re.findall(pattern, field)

     if match:
-        print(f"Removing unnecessary Unicode (U+00A0): {field}")
-        field = re.sub(pattern, "", field)
+        print(f"Replacing unnecessary Unicode (U+00A0): {field}")
+        field = re.sub(pattern, " ", field)

     # Check for soft hyphens (U+00AD), sometimes preceeded with a normal hyphen
     pattern = re.compile(r"\u002D*?\u00AD")
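Taken together, these two hunks switch the no-break space handling from removal to replacement with a normal space, while soft hyphens (optionally preceded by a regular hyphen) are still collapsed to a single hyphen per the docstring. A condensed, illustrative sketch of both behaviours:

```python
# Condensed sketch of the two fixes above: replace no-break spaces (U+00A0)
# with a normal space (the 0.3.1 change) and collapse an optional hyphen plus
# soft hyphen (U+00AD) into a single hyphen, per the docstring. Illustrative
# only; the real fix.py handles more characters.
import re

def fix_unnecessary_unicode(field):
    # Replace no-break spaces with a plain space
    if re.findall(r"\u00A0", field):
        print(f"Replacing unnecessary Unicode (U+00A0): {field}")
        field = re.sub(r"\u00A0", " ", field)

    # Collapse a hyphen followed by a soft hyphen (or a bare soft hyphen)
    if re.findall(r"\u002D*?\u00AD", field):
        field = re.sub(r"\u002D*?\u00AD", "-", field)

    return field

print(fix_unnecessary_unicode("poverty\u00ADline\u00A0data"))  # -> "poverty-line data"
```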
@@ -140,7 +140,7 @@ def duplicates(field):
         if value not in new_values:
             new_values.append(value)
         else:
-            print(f"Dropping duplicate value: {value}")
+            print(f"Removing duplicate value: {value}")

     # Create a new field consisting of all values joined with "||"
     new_field = "||".join(new_values)
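The duplicates fix keeps the first occurrence of each value in a multi-value field and joins the survivors back with "||". A self-contained sketch of that logic (the splitting step is outside the displayed hunk, so it is inferred here):

```python
# Self-contained sketch of the duplicate-removal fix shown above: keep the
# first occurrence of each value in a multi-value field and rejoin with "||".
# The splitting step is outside the displayed hunk, so it is inferred here.
def fix_duplicates(field):
    new_values = []
    for value in field.split("||"):
        if value not in new_values:
            new_values.append(value)
        else:
            print(f"Removing duplicate value: {value}")

    # Create a new field consisting of all values joined with "||"
    return "||".join(new_values)

print(fix_duplicates("Kenya||Ethiopia||Kenya"))  # -> "Kenya||Ethiopia"
```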
@@ -1 +1 @@
-VERSION = "0.3.0"
+VERSION = "0.3.1"
@@ -1,4 +1,4 @@
-dc.title,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country,filename
+dc.title,dc.date.issued,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country,filename
 Leading space,2019-07-29,,,,,,
 Trailing space ,2019-07-29,,,,,,
 Excessive space,2019-07-29,,,,,,
setup.py (2 changed lines)
@@ -14,7 +14,7 @@ install_requires = [

 setuptools.setup(
     name="csv-metadata-quality",
-    version="0.3.0",
+    version="0.3.1",
     author="Alan Orth",
     author_email="aorth@mjanja.ch",
     description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",