1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-22 04:02:19 +01:00

Ignore subregion field for missing region checks
All checks were successful
continuous-integration/drone/push Build is passing

Due to a sloppy regex I was sometimes matching the subregion field
when checking for missing UN M.49 regions in the region field.
This commit is contained in:
Alan Orth 2022-12-07 23:18:47 +01:00
parent 58e956360a
commit 051777bcec
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
4 changed files with 46 additions and 41 deletions

View File

@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## Unreleased
### Fixed
- Missing region check should ignore subregion field, if it exists
## [0.6.0] - 2022-09-02
### Changed
- Perform fix for "unnecessary" Unicode characters after we try to fix encoding

View File

@ -512,9 +512,9 @@ def countries_match_regions(row, exclude):
if match is not None:
country_column_name = label
# Find the name of the region column
# Find the name of the region column, but make sure it's not subregion!
match = re.match(r"^.*?region.*$", label)
if match is not None:
if match is not None and "sub" not in label:
region_column_name = label
# Find the name of the title column

View File

@ -327,9 +327,9 @@ def countries_match_regions(row, exclude):
if match is not None:
country_column_name = label
# Find the name of the region column
# Find the name of the region column, but make sure it's not subregion!
match = re.match(r"^.*?region.*$", label)
if match is not None:
if match is not None and "sub" not in label:
region_column_name = label
# Find the name of the title column

View File

@ -1,38 +1,39 @@
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type,dcterms.bibliographicCitation,cg.identifier.doi,cg.coverage.region
Leading space,2019-07-29,,,,,,,,,,,
Trailing space ,2019-07-29,,,,,,,,,,,
Excessive space,2019-07-29,,,,,,,,,,,
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,,,,
Duplicate||Duplicate,2019-07-29,,,,,,,,,,,
Invalid ISSN,2019-07-29,2321-2302,,,,,,,,,,
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,,,,
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,,,,
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,,,,
Invalid date,2019-07-260,,,,,,,,,,,
Multiple dates,2019-07-26||2019-01-10,,,,,,,,,,,
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,,,,
Unnecessary Unicode,2019-07-29,,,,,,,,,,,
Suspicious character||foreˆt,2019-07-29,,,,,,,,,,,
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,,
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,,
Invalid language,2019-07-29,,,Span,,,,,,,,
Invalid AGROVOC subject,2019-07-29,,,,LIVESTOCK||FOREST,,,,,,,
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type,dcterms.bibliographicCitation,cg.identifier.doi,cg.coverage.region,cg.coverage.subregion
Leading space,2019-07-29,,,,,,,,,,,,
Trailing space ,2019-07-29,,,,,,,,,,,,
Excessive space,2019-07-29,,,,,,,,,,,,
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,,,,,
Duplicate||Duplicate,2019-07-29,,,,,,,,,,,,
Invalid ISSN,2019-07-29,2321-2302,,,,,,,,,,,
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,,,,,
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,,,,,
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,,,,,
Invalid date,2019-07-260,,,,,,,,,,,,
Multiple dates,2019-07-26||2019-01-10,,,,,,,,,,,,
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,,,,,
Unnecessary Unicode,2019-07-29,,,,,,,,,,,,
Suspicious character||foreˆt,2019-07-29,,,,,,,,,,,,
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,,,
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,,,
Invalid language,2019-07-29,,,Span,,,,,,,,,
Invalid AGROVOC subject,2019-07-29,,,,LIVESTOCK||FOREST,,,,,,,,
Newline (LF),2019-07-30,,,,"TANZA
NIA",,,,,,,
Missing date,,,,,,,,,,,,
Invalid country,2019-08-01,,,,,KENYAA,,,,,,
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,,,,
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,,,,,,
"Missing space,after comma",2019-08-27,,,,,,,,,,,
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,,,,
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,,,,
Composéd Unicode,2020-01-14,,,,,,,,,,,
Decomposéd Unicode,2020-01-14,,,,,,,,,,,
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,,,,
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,,,,
Duplicate Title,2021-03-17,,,,,,,,Report,,,
Duplicate Title,2021-03-17,,,,,,,,Report,,,
Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,,
"DOI in citation, but missing cg.identifier.doi",2021-10-06,,,,,,,,,"Orth, A. 2021. DOI in citation, but missing cg.identifier.doi. doi: 10.1186/1743-422X-9-218",,
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,
Country missing region,2021-12-08,,,,,Kenya,,,,,,
NIA",,,,,,,,
Missing date,,,,,,,,,,,,,
Invalid country,2019-08-01,,,,,KENYAA,,,,,,,
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,,,,,
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,,,,,,,
"Missing space,after comma",2019-08-27,,,,,,,,,,,,
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,,,,,
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,,,,,
Composéd Unicode,2020-01-14,,,,,,,,,,,,
Decomposéd Unicode,2020-01-14,,,,,,,,,,,,
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,,,,,
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,,,,,
Duplicate Title,2021-03-17,,,,,,,,Report,,,,
Duplicate Title,2021-03-17,,,,,,,,Report,,,,
Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,,,
"DOI in citation, but missing cg.identifier.doi",2021-10-06,,,,,,,,,"Orth, A. 2021. DOI in citation, but missing cg.identifier.doi. doi: 10.1186/1743-422X-9-218",,,
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
Country missing region,2021-12-08,,,,,Kenya,,,,,,,
Subregion field shouldnt trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo

1 dc.title dcterms.issued dc.identifier.issn dc.identifier.isbn dcterms.language dcterms.subject cg.coverage.country filename dcterms.license dcterms.type dcterms.bibliographicCitation cg.identifier.doi cg.coverage.region cg.coverage.subregion
2 Leading space 2019-07-29
3 Trailing space 2019-07-29
4 Excessive space 2019-07-29
5 Miscellaenous ||whitespace | issues 2019-07-29
6 Duplicate||Duplicate 2019-07-29
7 Invalid ISSN 2019-07-29 2321-2302
8 Invalid ISBN 2019-07-29 978-0-306-40615-6
9 Multiple valid ISSNs 2019-07-29 0378-5955||0024-9319
10 Multiple valid ISBNs 2019-07-29 99921-58-10-7||978-0-306-40615-7
11 Invalid date 2019-07-260
12 Multiple dates 2019-07-26||2019-01-10
13 Invalid multi-value separator 2019-07-29 0378-5955|0024-9319
14 Unnecessary Unicode​ 2019-07-29
15 Suspicious character||foreˆt 2019-07-29
16 Invalid ISO 639-1 (alpha 2) language 2019-07-29 jp
17 Invalid ISO 639-3 (alpha 3) language 2019-07-29 chi
18 Invalid language 2019-07-29 Span
19 Invalid AGROVOC subject 2019-07-29 LIVESTOCK||FOREST
20 Newline (LF) 2019-07-30 TANZA NIA
21 Missing date
22 Invalid country 2019-08-01 KENYAA
23 Uncommon filename extension 2019-08-10 file.pdf.lck
24 Unneccesary unicode (U+002D + U+00AD) 2019-08-10 978-­92-­9043-­823-­6
25 Missing space,after comma 2019-08-27
26 Incorrect ISO 639-1 language 2019-09-26 es
27 Incorrect ISO 639-3 language 2019-09-26 spa
28 Composéd Unicode 2020-01-14
29 Decomposéd Unicode 2020-01-14
30 Unnecessary multi-value separator 2021-01-03 0378-5955||
31 Invalid SPDX license identifier 2021-03-11 CC-BY
32 Duplicate Title 2021-03-17 Report
33 Duplicate Title 2021-03-17 Report
34 Mojibake 2021-03-18 Publicaçao CIAT Report
35 DOI in citation, but missing cg.identifier.doi 2021-10-06 Orth, A. 2021. DOI in citation, but missing cg.identifier.doi. doi: 10.1186/1743-422X-9-218
36 Title missing from citation 2021-12-05 Orth, A. 2021. Title missing f rom citation.
37 Country missing region 2021-12-08 Kenya
38 Subregion field shouldn’t trigger region checks 2022-12-07 Kenya Eastern Africa Baringo
39