src/main/java: Refactor CountryCodeTagger.java

The task is now much more modular and can be easily and cleanly extended
to support ISO 3166-1 alpha-3, numeric codes, etc.
This commit is contained in:
Alan Orth 2020-08-02 15:51:18 +03:00
parent a6d3653c9e
commit e5d45e62be
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
1 changed file with 111 additions and 83 deletions

View File

@ -37,121 +37,149 @@ import java.util.List;
public class CountryCodeTagger extends AbstractCurationTask public class CountryCodeTagger extends AbstractCurationTask
{ {
private int status = Curator.CURATE_UNSET; public class CountryCodeTaggerConfig {
private String result = null; private final String isocodesJsonPath = "/org/cgiar/cgspace/ctasks/iso_3166-1.json";
private final String cgspaceCountriesJsonPath = "/org/cgiar/cgspace/ctasks/cgspace-countries.json";
private final String iso3166Field = taskProperty("iso3166.field");
private final String iso3166Alpha2Field = taskProperty("iso3166-alpha2.field");
private final boolean forceupdate = taskBooleanProperty("forceupdate", false);
private static String isocodesJsonPath; private List<String> results = new ArrayList<String>();
private static String cgspaceCountriesJsonPath;
private static String iso3166Field;
private static String iso3166Alpha2Field;
private static boolean forceupdate;
private List<String> results = new ArrayList<String>(); private Logger log = Logger.getLogger(CountryCodeTagger.class);
}
private static Logger log = Logger.getLogger(CountryCodeTagger.class); public class CountryCodeTaggerResult {
private int status = Curator.CURATE_UNSET;
private String result = null;
public int getStatus() {
return status;
}
public void setStatus(int status) {
this.status = status;
}
public String getResult() {
return result;
}
public void setResult(String result) {
this.result = result;
}
}
@Override @Override
public int perform(DSpaceObject dso) throws IOException public int perform(DSpaceObject dso) throws IOException
{ {
// Load configuration // gotta define this here so we can access it after the if context...
isocodesJsonPath = "/org/cgiar/cgspace/ctasks/iso_3166-1.json"; CountryCodeTaggerResult alpha2Result = new CountryCodeTaggerResult();
cgspaceCountriesJsonPath = "/org/cgiar/cgspace/ctasks/cgspace-countries.json";
iso3166Field = taskProperty("iso3166.field");
iso3166Alpha2Field = taskProperty("iso3166-alpha2.field");
forceupdate = taskBooleanProperty("forceupdate", false);
if (dso.getType() == Constants.ITEM) if (dso.getType() == Constants.ITEM)
{ {
// Load configuration
CountryCodeTaggerConfig config = new CountryCodeTaggerConfig();
Item item = (Item)dso; Item item = (Item)dso;
String itemHandle = item.getHandle();
Metadatum[] itemCountries = item.getMetadataByMetadataString(iso3166Field); alpha2Result = performAlpha2(item, config);
// skip items that don't have country metadata setResult(alpha2Result.getResult());
if (itemCountries.length == 0) { report(alpha2Result.getResult());
result = itemHandle + ": no countries, skipping."; }
status = Curator.CURATE_SKIP;
} else {
Gson gson = new Gson();
// TODO: convert to try: https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html return alpha2Result.getStatus();
BufferedReader reader = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(isocodesJsonPath))); }
ISO3166CountriesVocabulary isocodesCountriesJson = gson.fromJson(reader, ISO3166CountriesVocabulary.class);
reader.close();
reader = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(cgspaceCountriesJsonPath))); public CountryCodeTaggerResult performAlpha2(Item item, CountryCodeTaggerConfig config) throws IOException
CGSpaceCountriesVocabulary cgspaceCountriesJson = gson.fromJson(reader, CGSpaceCountriesVocabulary.class); {
reader.close(); CountryCodeTaggerResult alpha2Result = new CountryCodeTaggerResult();
String itemHandle = item.getHandle();
//System.out.println(itemHandle + ": " + itemCountries.length + " countries possibly need tagging"); Metadatum[] itemCountries = item.getMetadataByMetadataString(config.iso3166Field);
// split the alpha2 country code field into schema, element, and qualifier so we can use it with item.addMetadata() // skip items that don't have country metadata
String[] iso3166Alpha2FieldParts = iso3166Alpha2Field.split("\\."); if (itemCountries.length == 0) {
alpha2Result.setResult(itemHandle + ": no countries, skipping.");
alpha2Result.setStatus(Curator.CURATE_SKIP);
} else {
Gson gson = new Gson();
if (forceupdate) { // TODO: convert to try: https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
item.clearMetadata(iso3166Alpha2FieldParts[0], iso3166Alpha2FieldParts[1], iso3166Alpha2FieldParts[2], Item.ANY); BufferedReader reader = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(config.isocodesJsonPath)));
} ISO3166CountriesVocabulary isocodesCountriesJson = gson.fromJson(reader, ISO3166CountriesVocabulary.class);
reader.close();
// check the item's country codes, if any reader = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(config.cgspaceCountriesJsonPath)));
Metadatum[] itemAlpha2CountryCodes = item.getMetadataByMetadataString(iso3166Alpha2Field); CGSpaceCountriesVocabulary cgspaceCountriesJson = gson.fromJson(reader, CGSpaceCountriesVocabulary.class);
reader.close();
if (itemAlpha2CountryCodes.length == 0) { //System.out.println(itemHandle + ": " + itemCountries.length + " countries possibly need tagging");
//System.out.println(itemHandle + ": Should add codes for " + itemCountries.length + " countries.");
Integer addedCodeCount = 0; // split the alpha2 country code field into schema, element, and qualifier so we can use it with item.addMetadata()
for (Metadatum itemCountry : itemCountries) { String[] iso3166Alpha2FieldParts = config.iso3166Alpha2Field.split("\\.");
//check ISO 3166-1 countries
for (CountriesVocabulary.Country country : isocodesCountriesJson.countries) {
if (itemCountry.value.equalsIgnoreCase(country.getName()) || itemCountry.value.equalsIgnoreCase(country.get_official_name()) || itemCountry.value.equalsIgnoreCase(country.get_common_name())) {
System.out.println(itemHandle + ": adding country code " + country.getAlpha_2());
try { if (config.forceupdate) {
item.addMetadata(iso3166Alpha2FieldParts[0], iso3166Alpha2FieldParts[1], iso3166Alpha2FieldParts[2], "en_US", country.getAlpha_2()); item.clearMetadata(iso3166Alpha2FieldParts[0], iso3166Alpha2FieldParts[1], iso3166Alpha2FieldParts[2], Item.ANY);
item.update(); }
addedCodeCount++; // check the item's country codes, if any
Metadatum[] itemAlpha2CountryCodes = item.getMetadataByMetadataString(config.iso3166Alpha2Field);
result = itemHandle + ": added " + addedCodeCount + " country code(s)"; if (itemAlpha2CountryCodes.length == 0) {
status = Curator.CURATE_SUCCESS; //System.out.println(itemHandle + ": Should add codes for " + itemCountries.length + " countries.");
} catch (SQLException | AuthorizeException sqle) {
log.debug(sqle.getMessage());
result = itemHandle + ": error";
status = Curator.CURATE_ERROR;
}
}
}
//check CGSpace countries
for (CountriesVocabulary.Country country : cgspaceCountriesJson.countries) {
if (itemCountry.value.equalsIgnoreCase(country.getCgspace_name())) {
System.out.println(itemHandle + ": adding country code " + country.getAlpha_2());
try { int addedCodeCount = 0;
// we have the field as a string, so we need to split/tokenize it here actually for (Metadatum itemCountry : itemCountries) {
item.addMetadata(iso3166Alpha2FieldParts[0], iso3166Alpha2FieldParts[1], iso3166Alpha2FieldParts[2], "en_US", country.getAlpha_2()); //check ISO 3166-1 countries
item.update(); for (CountriesVocabulary.Country country : isocodesCountriesJson.countries) {
if (itemCountry.value.equalsIgnoreCase(country.getName()) || itemCountry.value.equalsIgnoreCase(country.get_official_name()) || itemCountry.value.equalsIgnoreCase(country.get_common_name())) {
System.out.println(itemHandle + ": adding country code " + country.getAlpha_2());
addedCodeCount++; try {
item.addMetadata(iso3166Alpha2FieldParts[0], iso3166Alpha2FieldParts[1], iso3166Alpha2FieldParts[2], "en_US", country.getAlpha_2());
item.update();
result = itemHandle + ": added " + addedCodeCount + " country code(s)"; addedCodeCount++;
status = Curator.CURATE_SUCCESS;
} catch (SQLException | AuthorizeException sqle) { alpha2Result.setResult(itemHandle + ": added " + addedCodeCount + " country code(s)");
log.debug(sqle.getMessage()); alpha2Result.setStatus(Curator.CURATE_SUCCESS);
result = itemHandle + ": error"; } catch (SQLException | AuthorizeException sqle) {
status = Curator.CURATE_ERROR; config.log.debug(sqle.getMessage());
} alpha2Result.setResult(itemHandle + ": error");
alpha2Result.setStatus(Curator.CURATE_ERROR);
}
}
}
//check CGSpace countries
for (CountriesVocabulary.Country country : cgspaceCountriesJson.countries) {
if (itemCountry.value.equalsIgnoreCase(country.getCgspace_name())) {
System.out.println(itemHandle + ": adding country code " + country.getAlpha_2());
try {
// we have the field as a string, so we need to split/tokenize it here actually
item.addMetadata(iso3166Alpha2FieldParts[0], iso3166Alpha2FieldParts[1], iso3166Alpha2FieldParts[2], "en_US", country.getAlpha_2());
item.update();
addedCodeCount++;
alpha2Result.setResult(itemHandle + ": added " + addedCodeCount + " country code(s)");
alpha2Result.setStatus(Curator.CURATE_SUCCESS);
} catch (SQLException | AuthorizeException sqle) {
config.log.debug(sqle.getMessage());
alpha2Result.setResult(itemHandle + ": error");
alpha2Result.setStatus(Curator.CURATE_ERROR);
} }
} }
} }
} else {
result = itemHandle + ": item has country codes, skipping";
status = Curator.CURATE_SKIP;
} }
} else {
alpha2Result.setResult(itemHandle + ": item has country codes, skipping");
alpha2Result.setStatus(Curator.CURATE_SKIP);
} }
}
setResult(result); return alpha2Result;
report(result);
}
return status;
} }
} }