1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-22 05:45:02 +01:00

Ignore subregion field for missing region checks
All checks were successful
continuous-integration/drone/push Build is passing

Due to a sloppy regex, I was sometimes matching the subregion field
when checking for missing UN M.49 regions in the region field.
This commit is contained in:
Alan Orth 2022-12-07 23:18:47 +01:00
parent 58e956360a
commit 051777bcec
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
4 changed files with 46 additions and 41 deletions

View File

@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## Unreleased
### Fixed
- Missing region check should ignore subregion field, if it exists
## [0.6.0] - 2022-09-02 ## [0.6.0] - 2022-09-02
### Changed ### Changed
- Perform fix for "unnecessary" Unicode characters after we try to fix encoding - Perform fix for "unnecessary" Unicode characters after we try to fix encoding

View File

@ -512,9 +512,9 @@ def countries_match_regions(row, exclude):
if match is not None: if match is not None:
country_column_name = label country_column_name = label
# Find the name of the region column # Find the name of the region column, but make sure it's not subregion!
match = re.match(r"^.*?region.*$", label) match = re.match(r"^.*?region.*$", label)
if match is not None: if match is not None and "sub" not in label:
region_column_name = label region_column_name = label
# Find the name of the title column # Find the name of the title column

View File

@ -327,9 +327,9 @@ def countries_match_regions(row, exclude):
if match is not None: if match is not None:
country_column_name = label country_column_name = label
# Find the name of the region column # Find the name of the region column, but make sure it's not subregion!
match = re.match(r"^.*?region.*$", label) match = re.match(r"^.*?region.*$", label)
if match is not None: if match is not None and "sub" not in label:
region_column_name = label region_column_name = label
# Find the name of the title column # Find the name of the title column

View File

@ -1,38 +1,39 @@
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type,dcterms.bibliographicCitation,cg.identifier.doi,cg.coverage.region dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type,dcterms.bibliographicCitation,cg.identifier.doi,cg.coverage.region,cg.coverage.subregion
Leading space,2019-07-29,,,,,,,,,,, Leading space,2019-07-29,,,,,,,,,,,,
Trailing space ,2019-07-29,,,,,,,,,,, Trailing space ,2019-07-29,,,,,,,,,,,,
Excessive space,2019-07-29,,,,,,,,,,, Excessive space,2019-07-29,,,,,,,,,,,,
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,,,, Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,,,,,
Duplicate||Duplicate,2019-07-29,,,,,,,,,,, Duplicate||Duplicate,2019-07-29,,,,,,,,,,,,
Invalid ISSN,2019-07-29,2321-2302,,,,,,,,,, Invalid ISSN,2019-07-29,2321-2302,,,,,,,,,,,
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,,,, Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,,,,,
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,,,, Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,,,,,
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,,,, Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,,,,,
Invalid date,2019-07-260,,,,,,,,,,, Invalid date,2019-07-260,,,,,,,,,,,,
Multiple dates,2019-07-26||2019-01-10,,,,,,,,,,, Multiple dates,2019-07-26||2019-01-10,,,,,,,,,,,,
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,,,, Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,,,,,
Unnecessary Unicode,2019-07-29,,,,,,,,,,, Unnecessary Unicode,2019-07-29,,,,,,,,,,,,
Suspicious character||foreˆt,2019-07-29,,,,,,,,,,, Suspicious character||foreˆt,2019-07-29,,,,,,,,,,,,
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,, Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,,,
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,, Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,,,
Invalid language,2019-07-29,,,Span,,,,,,,, Invalid language,2019-07-29,,,Span,,,,,,,,,
Invalid AGROVOC subject,2019-07-29,,,,LIVESTOCK||FOREST,,,,,,, Invalid AGROVOC subject,2019-07-29,,,,LIVESTOCK||FOREST,,,,,,,,
Newline (LF),2019-07-30,,,,"TANZA Newline (LF),2019-07-30,,,,"TANZA
NIA",,,,,,, NIA",,,,,,,,
Missing date,,,,,,,,,,,, Missing date,,,,,,,,,,,,,
Invalid country,2019-08-01,,,,,KENYAA,,,,,, Invalid country,2019-08-01,,,,,KENYAA,,,,,,,
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,,,, Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,,,,,
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,,,,,, Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,,,,,,,
"Missing space,after comma",2019-08-27,,,,,,,,,,, "Missing space,after comma",2019-08-27,,,,,,,,,,,,
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,,,, Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,,,,,
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,,,, Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,,,,,
Composéd Unicode,2020-01-14,,,,,,,,,,, Composéd Unicode,2020-01-14,,,,,,,,,,,,
Decomposéd Unicode,2020-01-14,,,,,,,,,,, Decomposéd Unicode,2020-01-14,,,,,,,,,,,,
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,,,, Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,,,,,
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,,,, Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,,,,,
Duplicate Title,2021-03-17,,,,,,,,Report,,, Duplicate Title,2021-03-17,,,,,,,,Report,,,,
Duplicate Title,2021-03-17,,,,,,,,Report,,, Duplicate Title,2021-03-17,,,,,,,,Report,,,,
Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,, Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,,,
"DOI in citation, but missing cg.identifier.doi",2021-10-06,,,,,,,,,"Orth, A. 2021. DOI in citation, but missing cg.identifier.doi. doi: 10.1186/1743-422X-9-218",, "DOI in citation, but missing cg.identifier.doi",2021-10-06,,,,,,,,,"Orth, A. 2021. DOI in citation, but missing cg.identifier.doi. doi: 10.1186/1743-422X-9-218",,,
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",, Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
Country missing region,2021-12-08,,,,,Kenya,,,,,, Country missing region,2021-12-08,,,,,Kenya,,,,,,,
Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo

1 dc.title dcterms.issued dc.identifier.issn dc.identifier.isbn dcterms.language dcterms.subject cg.coverage.country filename dcterms.license dcterms.type dcterms.bibliographicCitation cg.identifier.doi cg.coverage.region cg.coverage.subregion
2 Leading space 2019-07-29
3 Trailing space 2019-07-29
4 Excessive space 2019-07-29
5 Miscellaenous ||whitespace | issues 2019-07-29
6 Duplicate||Duplicate 2019-07-29
7 Invalid ISSN 2019-07-29 2321-2302
8 Invalid ISBN 2019-07-29 978-0-306-40615-6
9 Multiple valid ISSNs 2019-07-29 0378-5955||0024-9319
10 Multiple valid ISBNs 2019-07-29 99921-58-10-7||978-0-306-40615-7
11 Invalid date 2019-07-260
12 Multiple dates 2019-07-26||2019-01-10
13 Invalid multi-value separator 2019-07-29 0378-5955|0024-9319
14 Unnecessary Unicode​ 2019-07-29
15 Suspicious character||foreˆt 2019-07-29
16 Invalid ISO 639-1 (alpha 2) language 2019-07-29 jp
17 Invalid ISO 639-3 (alpha 3) language 2019-07-29 chi
18 Invalid language 2019-07-29 Span
19 Invalid AGROVOC subject 2019-07-29 LIVESTOCK||FOREST
20 Newline (LF) 2019-07-30 TANZA NIA
21 Missing date
22 Invalid country 2019-08-01 KENYAA
23 Uncommon filename extension 2019-08-10 file.pdf.lck
24 Unneccesary unicode (U+002D + U+00AD) 2019-08-10 978-­92-­9043-­823-­6
25 Missing space,after comma 2019-08-27
26 Incorrect ISO 639-1 language 2019-09-26 es
27 Incorrect ISO 639-3 language 2019-09-26 spa
28 Composéd Unicode 2020-01-14
29 Decomposéd Unicode 2020-01-14
30 Unnecessary multi-value separator 2021-01-03 0378-5955||
31 Invalid SPDX license identifier 2021-03-11 CC-BY
32 Duplicate Title 2021-03-17 Report
33 Duplicate Title 2021-03-17 Report
34 Mojibake 2021-03-18 Publicaçao CIAT Report
35 DOI in citation, but missing cg.identifier.doi 2021-10-06 Orth, A. 2021. DOI in citation, but missing cg.identifier.doi. doi: 10.1186/1743-422X-9-218
36 Title missing from citation 2021-12-05 Orth, A. 2021. Title missing f rom citation.
37 Country missing region 2021-12-08 Kenya
38 Subregion field shouldn’t trigger region checks 2022-12-07 Kenya Eastern Africa Baringo
39