diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index 36d8aa7..5c01096 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -205,14 +205,23 @@ def run(argv):
         # Check: title in citation
         check.title_in_citation(df_transposed[column])
 
-        # Check: countries match regions
-        check.countries_match_regions(df_transposed[column])
+        if args.unsafe_fixes:
+            # Fix: countries match regions
+            df_transposed[column] = fix.countries_match_regions(df_transposed[column])
+        else:
+            # Check: countries match regions
+            check.countries_match_regions(df_transposed[column])
 
         if args.experimental_checks:
             experimental.correct_language(df_transposed[column])
 
+    # Transpose the DataFrame back before writing. This is probably wasteful to
+    # do every time since we technically only need to do it if we've done the
+    # countries/regions fix above, but I can't think of another way for now.
+    df_transposed_back = df_transposed.T
+
     # Write
-    df.to_csv(args.output_file, index=False)
+    df_transposed_back.to_csv(args.output_file, index=False)
 
     # Close the input and output files before exiting
     args.input_file.close()
diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py
index 066dc68..91b0beb 100755
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -3,6 +3,7 @@
 import re
 from unicodedata import normalize
 
+import country_converter as coco
 import pandas as pd
 from colorama import Fore
 from ftfy import TextFixerConfig, fix_text
@@ -289,3 +290,97 @@ def mojibake(field, field_name):
         return fix_text(field, config)
     else:
         return field
+
+
+def countries_match_regions(row):
+    """Add any UN M.49 regions that are missing for an item's countries.
+
+    Check for the scenario where an item has country coverage metadata, but
+    does not have the corresponding region metadata. For example, an item that
+    has country coverage "Kenya" should also have region "Eastern Africa"
+    according to the UN M.49 classification scheme.
+
+    See: https://unstats.un.org/unsd/methodology/m49/
+
+    The row is a pandas Series whose labels are the metadata column names and
+    whose multi-value fields are separated by "||". The row is modified in
+    place and always returned, so the caller can assign the result back.
+    """
+    # Initialize these outside the loop so that we can set them inside it and
+    # still be able to access them afterwards.
+    country_column_name = ""
+    region_column_name = ""
+    title_column_name = ""
+
+    # Iterate over the labels of the current row's values to get the names of
+    # the country, region, and title columns.
+    for label in row.axes[0]:
+        # Find the name of the country column
+        match = re.match(r"^.*?country.*$", label)
+        if match is not None:
+            country_column_name = label
+
+        # Find the name of the region column
+        match = re.match(r"^.*?region.*$", label)
+        if match is not None:
+            region_column_name = label
+
+        # Find the name of the title column
+        match = re.match(r"^(dc|dcterms)\.title.*$", label)
+        if match is not None:
+            title_column_name = label
+
+    # Without both the country and region columns there is nothing to fix
+    if country_column_name == "" or region_column_name == "":
+        return row
+
+    # If we don't have any countries then we should return the row untouched.
+    # Returning None here would blank out the item when the caller assigns
+    # the result back to the DataFrame.
+    country_value = row[country_column_name]
+    if pd.isna(country_value) or country_value == "":
+        return row
+
+    countries = country_value.split("||")
+
+    # Treat a missing or empty region field as "no regions" so that we don't
+    # end up with empty values when joining below.
+    region_value = row[region_column_name]
+    if pd.isna(region_value) or region_value == "":
+        regions = []
+    else:
+        regions = region_value.split("||")
+
+    # An empty list for our regions so we can keep track for all countries
+    missing_regions = []
+
+    for country in countries:
+        # Look up the UN M.49 regions for this country code. CoCo seems to
+        # only list the direct region, ie Western Africa, rather than all
+        # the parent regions ("Sub-Saharan Africa", "Africa", "World")
+        un_region = coco.convert(names=country, to="UNRegion")
+
+        # CoCo returns "not found" for names it can't match, which we must
+        # not add as a region.
+        if un_region == "not found":
+            continue
+
+        if un_region not in regions and un_region not in missing_regions:
+            missing_regions.append(un_region)
+
+    if not missing_regions:
+        return row
+
+    # Not every input has a title column, so fall back to an empty string
+    title = row[title_column_name] if title_column_name != "" else ""
+
+    for missing_region in missing_regions:
+        print(
+            f"{Fore.YELLOW}Adding missing region ({missing_region}): {Fore.RESET}{title}"
+        )
+
+    # Append the missing regions to the existing ones, separating every value
+    # with "||" like the rest of the multi-value fields.
+    row[region_column_name] = "||".join(regions + missing_regions)
+
+    return row
diff --git a/tests/test_fix.py b/tests/test_fix.py
index ad7ec42..5ff3e6c 100644
--- a/tests/test_fix.py
+++ b/tests/test_fix.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: GPL-3.0-only
 
+import pandas as pd
+
 import csv_metadata_quality.fix as fix
 
 
@@ -120,3 +122,32 @@ def test_fix_mojibake():
     field_name = "dcterms.isPartOf"
 
     assert fix.mojibake(field, field_name) == "CIAT PublicaƧao"
+
+
+def test_fix_country_not_matching_region():
+    """Test an item with regions not matching its country list."""
+
+    title = "Testing an item with no matching region."
+    country = "Kenya"
+    region = ""
+    missing_region = "Eastern Africa"
+
+    # Emulate a column in a transposed dataframe (which is just a series)
+    d = {
+        "dc.title": title,
+        "cg.coverage.country": country,
+        "cg.coverage.region": region,
+    }
+    series = pd.Series(data=d)
+
+    result = fix.countries_match_regions(series)
+
+    # Emulate the correct series we are expecting
+    d_correct = {
+        "dc.title": title,
+        "cg.coverage.country": country,
+        "cg.coverage.region": missing_region,
+    }
+    series_correct = pd.Series(data=d_correct)
+
+    pd.testing.assert_series_equal(result, series_correct)