diff --git a/CHANGELOG.md b/CHANGELOG.md index 3558f23..73d231c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased +### Fixed +- Missing region check should ignore subregion field, if it exists + ## [0.6.0] - 2022-09-02 ### Changed - Perform fix for "unnecessary" Unicode characters after we try to fix encoding diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index cd7ed72..633985d 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -512,9 +512,9 @@ def countries_match_regions(row, exclude): if match is not None: country_column_name = label - # Find the name of the region column + # Find the name of the region column, but make sure it's not subregion! match = re.match(r"^.*?region.*$", label) - if match is not None: + if match is not None and "sub" not in label: region_column_name = label # Find the name of the title column diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index 3437fa9..cf948ed 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -327,9 +327,9 @@ def countries_match_regions(row, exclude): if match is not None: country_column_name = label - # Find the name of the region column + # Find the name of the region column, but make sure it's not subregion! match = re.match(r"^.*?region.*$", label) - if match is not None: + if match is not None and "sub" not in label: region_column_name = label # Find the name of the title column diff --git a/data/test.csv b/data/test.csv index 0718ca6..119fbbf 100644 --- a/data/test.csv +++ b/data/test.csv @@ -1,38 +1,39 @@ -dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type,dcterms.bibliographicCitation,cg.identifier.doi,cg.coverage.region - Leading space,2019-07-29,,,,,,,,,,, -Trailing space ,2019-07-29,,,,,,,,,,, -Excessive space,2019-07-29,,,,,,,,,,, -Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,,,, -Duplicate||Duplicate,2019-07-29,,,,,,,,,,, -Invalid ISSN,2019-07-29,2321-2302,,,,,,,,,, -Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,,,, -Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,,,, -Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,,,, -Invalid date,2019-07-260,,,,,,,,,,, -Multiple dates,2019-07-26||2019-01-10,,,,,,,,,,, -Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,,,, -Unnecessary Unicode​,2019-07-29,,,,,,,,,,, -Suspicious character||foreˆt,2019-07-29,,,,,,,,,,, -Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,, -Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,, -Invalid language,2019-07-29,,,Span,,,,,,,, -Invalid AGROVOC subject,2019-07-29,,,,LIVESTOCK||FOREST,,,,,,, +dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type,dcterms.bibliographicCitation,cg.identifier.doi,cg.coverage.region,cg.coverage.subregion + Leading space,2019-07-29,,,,,,,,,,,, +Trailing space ,2019-07-29,,,,,,,,,,,, +Excessive space,2019-07-29,,,,,,,,,,,, +Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,,,,, +Duplicate||Duplicate,2019-07-29,,,,,,,,,,,, +Invalid ISSN,2019-07-29,2321-2302,,,,,,,,,,, +Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,,,,, +Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,,,,, +Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,,,,, +Invalid date,2019-07-260,,,,,,,,,,,, +Multiple dates,2019-07-26||2019-01-10,,,,,,,,,,,, +Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,,,,, +Unnecessary Unicode​,2019-07-29,,,,,,,,,,,, +Suspicious character||foreˆt,2019-07-29,,,,,,,,,,,, +Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,,, +Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,,, +Invalid language,2019-07-29,,,Span,,,,,,,,, +Invalid AGROVOC subject,2019-07-29,,,,LIVESTOCK||FOREST,,,,,,,, Newline (LF),2019-07-30,,,,"TANZA -NIA",,,,,,, -Missing date,,,,,,,,,,,, -Invalid country,2019-08-01,,,,,KENYAA,,,,,, -Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,,,, -Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,,,,,, -"Missing space,after comma",2019-08-27,,,,,,,,,,, -Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,,,, -Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,,,, -Composéd Unicode,2020-01-14,,,,,,,,,,, -Decomposéd Unicode,2020-01-14,,,,,,,,,,, -Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,,,, -Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,,,, -Duplicate Title,2021-03-17,,,,,,,,Report,,, -Duplicate Title,2021-03-17,,,,,,,,Report,,, -Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,, -"DOI in citation, but missing cg.identifier.doi",2021-10-06,,,,,,,,,"Orth, A. 2021. DOI in citation, but missing cg.identifier.doi. doi: 10.1186/1743-422X-9-218",, -Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",, -Country missing region,2021-12-08,,,,,Kenya,,,,,, +NIA",,,,,,,, +Missing date,,,,,,,,,,,,, +Invalid country,2019-08-01,,,,,KENYAA,,,,,,, +Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,,,,, +Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,,,,,,, +"Missing space,after comma",2019-08-27,,,,,,,,,,,, +Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,,,,, +Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,,,,, +Composéd Unicode,2020-01-14,,,,,,,,,,,, +Decomposéd Unicode,2020-01-14,,,,,,,,,,,, +Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,,,,, +Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,,,,, +Duplicate Title,2021-03-17,,,,,,,,Report,,,, +Duplicate Title,2021-03-17,,,,,,,,Report,,,, +Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,,, +"DOI in citation, but missing cg.identifier.doi",2021-10-06,,,,,,,,,"Orth, A. 2021. DOI in citation, but missing cg.identifier.doi. doi: 10.1186/1743-422X-9-218",,, +Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,, +Country missing region,2021-12-08,,,,,Kenya,,,,,,, +Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo