2024-11-28 08:38:18 +01:00
4 changed files with 30 additions and 67 deletions
--- a/.drone.yml
+++ b/.drone.yml
@ -15,12 +15,8 @@ steps:
  - python setup.py install
  # Basic test
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv
-  # Basic test with unsafe fixes
+  # Test with unsafe fixes
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
-  # Geography test
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
-  # Geography test with unsafe fixes
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
  # Test with experimental checks
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
  # Test with AGROVOC validation
@ -45,12 +41,8 @@ steps:
  - python setup.py install
  # Basic test
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv
-  # Basic test with unsafe fixes
+  # Test with unsafe fixes
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
-  # Geography test
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
-  # Geography test with unsafe fixes
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
  # Test with experimental checks
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
  # Test with AGROVOC validation
@ -75,12 +67,8 @@ steps:
  - python setup.py install
  # Basic test
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv
-  # Basic test with unsafe fixes
+  # Test with unsafe fixes
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
-  # Geography test
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
-  # Geography test with unsafe fixes
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
  # Test with experimental checks
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
  # Test with AGROVOC validation
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-3.0-only

-import logging
 import os
 import re
 from datetime import datetime, timedelta
@ -218,7 +217,7 @@ def agrovoc(field, field_name, drop):
    )

    # prune old cache entries
-    # requests_cache.remove_expired_responses()
+    requests_cache.remove_expired_responses()

    # Initialize an empty list to hold the validated AGROVOC values
    values = list()
@ -486,15 +485,6 @@ def countries_match_regions(row):
    region_column_name = ""
    title_column_name = ""

-    # Instantiate a CountryConverter() object here. According to the docs it is
-    # more performant to do that as opposed to calling coco.convert() directly
-    # because we don't need to re-load the country data with each iteration.
-    cc = coco.CountryConverter()
-
-    # Set logging to ERROR so country_converter's convert() doesn't print the
-    # "not found in regex" warning message to the screen.
-    logging.basicConfig(level=logging.ERROR)
-
    # Iterate over the labels of the current row's values to get the names of
    # the title and citation columns. Then we check if the title is present in
    # the citation.
@ -528,15 +518,23 @@ def countries_match_regions(row):
        else:
            regions = list()

+        # Collect each missing region only once, across all countries
+        missing_regions = list()
+
        for country in countries:
            # Look up the UN M.49 regions for this country code. CoCo seems to
            # only list the direct region, ie Western Africa, rather than all
            # the parent regions ("Sub-Saharan Africa", "Africa", "World")
-            un_region = cc.convert(names=country, to="UNRegion")
+            un_region = coco.convert(names=country, to="UNRegion")

-            if un_region != "not found" and un_region not in regions:
+            if un_region not in regions:
+                if un_region not in missing_regions:
+                    missing_regions.append(un_region)
+
+        if len(missing_regions) > 0:
+            for missing_region in missing_regions:
                print(
-                    f"{Fore.YELLOW}Missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
+                    f"{Fore.YELLOW}Missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}"
                )

    return
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-3.0-only

-import logging
 import re
 from unicodedata import normalize

@ -309,15 +308,6 @@ def countries_match_regions(row):
    region_column_name = ""
    title_column_name = ""

-    # Instantiate a CountryConverter() object here. According to the docs it is
-    # more performant to do that as opposed to calling coco.convert() directly
-    # because we don't need to re-load the country data with each iteration.
-    cc = coco.CountryConverter()
-
-    # Set logging to ERROR so country_converter's convert() doesn't print the
-    # "not found in regex" warning message to the screen.
-    logging.basicConfig(level=logging.ERROR)
-
    # Iterate over the labels of the current row's values to get the names of
    # the title and citation columns. Then we check if the title is present in
    # the citation.
@ -358,18 +348,18 @@ def countries_match_regions(row):
            # Look up the UN M.49 regions for this country code. CoCo seems to
            # only list the direct region, ie Western Africa, rather than all
            # the parent regions ("Sub-Saharan Africa", "Africa", "World")
-            un_region = cc.convert(names=country, to="UNRegion")
+            un_region = coco.convert(names=country, to="UNRegion")

-            # Add the new un_region to regions if it is not "not found" and if
-            # it doesn't already exist in regions.
-            if un_region != "not found" and un_region not in regions:
+            if un_region not in regions:
                if un_region not in missing_regions:
-                    print(
-                        f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
-                    )
                    missing_regions.append(un_region)

        if len(missing_regions) > 0:
+            for missing_region in missing_regions:
+                print(
+                    f"{Fore.YELLOW}Adding missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}"
+                )
+
        # Add the missing regions back to the row, paying attention to whether
        # or not the row's region column is None (aka null) or just an empty
        # string (length would be 0).
--- a/data/test-geography.csv
+++ b/data/test-geography.csv
@ -1,13 +0,0 @@
-dc.title,dcterms.issued,dcterms.type,dc.contributor.author,cg.coverage.country,cg.coverage.region
-No country,2022-09-01,Report,"Orth, Alan",,
-Matching country and region,2022-09-01,Report,"Orth, Alan",Kenya,Eastern Africa
-Missing region,2022-09-01,Report,"Orth, Alan",Kenya,
-Caribbean country with matching region,2022-09-01,Report,"Orth, Alan",Bahamas,Caribbean
-Caribbean country with no region,2022-09-01,Report,"Orth, Alan",Bahamas,
-Fake country with no region,2022-09-01,Report,"Orth, Alan",Yeah Baby,
-SE Asian country with matching region,2022-09-01,Report,"Orth, Alan",Cambodia,South-eastern Asia
-SE Asian country with no region,2022-09-01,Report,"Orth, Alan",Cambodia,
-Duplicate countries with matching region,2022-09-01,Report,"Orth, Alan",Kenya||Kenya,Eastern Africa
-Duplicate countries with missing regions,2022-09-01,Report,"Orth, Alan",Kenya||Kenya,
-Multiple countries with no regions,2022-09-01,Report,"Orth, Alan",Kenya||Bahamas,
-Multiple countries with mixed matching regions,2022-09-01,Report,"Orth, Alan",Kenya||Bahamas,Eastern Africa