Compare commits

...

4 Commits

Author SHA1 Message Date
b396fba043
src: format Java files with google-java-format
Using AOSP format so we get four spaces instead of two.
2022-10-06 14:27:51 +03:00
38a9cc5188
src: organize imports in VS Code 2022-10-06 14:26:44 +03:00
16db38967b
src: handle null descriptions in FixJpgJpgThumbnails 2022-10-06 14:17:41 +03:00
2604dc3cce
src: skip Infographics and Maps in FixJpgJpgThumbnails
Instead of checking whether they exist and then skipping them just
at the moment when we want to swap the bitstreams, let's bail early
when we know an item is an Infographic or a Map.
2022-10-06 14:15:58 +03:00
6 changed files with 252 additions and 178 deletions

View File

@ -1,8 +1,8 @@
/* /*
* Copyright (C) 2020 Alan Orth * Copyright (C) 2020 Alan Orth
* *
* SPDX-License-Identifier: GPL-3.0-or-later * SPDX-License-Identifier: GPL-3.0-or-later
*/ */
package io.github.ilri.cgspace.ctasks; package io.github.ilri.cgspace.ctasks;

View File

@ -1,8 +1,8 @@
/* /*
* Copyright (C) 2020 Alan Orth * Copyright (C) 2020 Alan Orth
* *
* SPDX-License-Identifier: GPL-3.0-or-later * SPDX-License-Identifier: GPL-3.0-or-later
*/ */
package io.github.ilri.cgspace.ctasks; package io.github.ilri.cgspace.ctasks;
@ -11,15 +11,16 @@ import javax.annotation.Nullable;
public class CountriesVocabulary { public class CountriesVocabulary {
class Country { class Country {
private String name; //required private String name; // required
private String common_name; //optional private String common_name; // optional
private String official_name; //optional private String official_name; // optional
private String cgspace_name; //optional private String cgspace_name; // optional
private String numeric; //required Hmmmm need to cast this... private String numeric; // required Hmmmm need to cast this...
private String alpha_2; //required private String alpha_2; // required
private String alpha_3; //required private String alpha_3; // required
public Country(String name, public Country(
String name,
@Nullable String common_name, @Nullable String common_name,
@Nullable String official_name, @Nullable String official_name,
@Nullable String cgspace_name, @Nullable String cgspace_name,
@ -30,7 +31,9 @@ public class CountriesVocabulary {
this.common_name = common_name; this.common_name = common_name;
this.official_name = official_name; this.official_name = official_name;
this.cgspace_name = cgspace_name; this.cgspace_name = cgspace_name;
this.numeric = numeric; // fuuuuu this is a string and we can't cast to Integer because some values are zeropadded like "004" this.numeric =
numeric; // fuuuuu this is a string and we can't cast to Integer because some
// values are zeropadded like "004"
this.alpha_2 = alpha_2; this.alpha_2 = alpha_2;
this.alpha_3 = alpha_3; this.alpha_3 = alpha_3;
} }

View File

@ -1,12 +1,13 @@
/* /*
* Copyright (C) 2020 Alan Orth * Copyright (C) 2020 Alan Orth
* *
* SPDX-License-Identifier: GPL-3.0-or-later * SPDX-License-Identifier: GPL-3.0-or-later
*/ */
package io.github.ilri.cgspace.ctasks; package io.github.ilri.cgspace.ctasks;
import com.google.gson.Gson; import com.google.gson.Gson;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.dspace.authorize.AuthorizeException; import org.dspace.authorize.AuthorizeException;
import org.dspace.content.DSpaceObject; import org.dspace.content.DSpaceObject;
@ -23,11 +24,11 @@ import java.sql.SQLException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
public class CountryCodeTagger extends AbstractCurationTask public class CountryCodeTagger extends AbstractCurationTask {
{
public class CountryCodeTaggerConfig { public class CountryCodeTaggerConfig {
private final String isocodesJsonPath = "/io/github/ilri/cgspace/ctasks/iso_3166-1.json"; private final String isocodesJsonPath = "/io/github/ilri/cgspace/ctasks/iso_3166-1.json";
private final String cgspaceCountriesJsonPath = "/io/github/ilri/cgspace/ctasks/cgspace-countries.json"; private final String cgspaceCountriesJsonPath =
"/io/github/ilri/cgspace/ctasks/cgspace-countries.json";
private final String iso3166Field = taskProperty("iso3166.field"); private final String iso3166Field = taskProperty("iso3166.field");
private final String iso3166Alpha2Field = taskProperty("iso3166-alpha2.field"); private final String iso3166Alpha2Field = taskProperty("iso3166-alpha2.field");
private final boolean forceupdate = taskBooleanProperty("forceupdate", false); private final boolean forceupdate = taskBooleanProperty("forceupdate", false);
@ -57,17 +58,15 @@ public class CountryCodeTagger extends AbstractCurationTask
} }
@Override @Override
public int perform(DSpaceObject dso) throws IOException public int perform(DSpaceObject dso) throws IOException {
{
// gotta define this here so we can access it after the if context... // gotta define this here so we can access it after the if context...
CountryCodeTaggerResult alpha2Result = new CountryCodeTaggerResult(); CountryCodeTaggerResult alpha2Result = new CountryCodeTaggerResult();
if (dso.getType() == Constants.ITEM) if (dso.getType() == Constants.ITEM) {
{
// Load configuration // Load configuration
CountryCodeTaggerConfig config = new CountryCodeTaggerConfig(); CountryCodeTaggerConfig config = new CountryCodeTaggerConfig();
Item item = (Item)dso; Item item = (Item) dso;
try { try {
alpha2Result = performAlpha2(item, config); alpha2Result = performAlpha2(item, config);
@ -82,11 +81,13 @@ public class CountryCodeTagger extends AbstractCurationTask
return alpha2Result.getStatus(); return alpha2Result.getStatus();
} }
public CountryCodeTaggerResult performAlpha2(Item item, CountryCodeTaggerConfig config) throws IOException, SQLException { public CountryCodeTaggerResult performAlpha2(Item item, CountryCodeTaggerConfig config)
throws IOException, SQLException {
CountryCodeTaggerResult alpha2Result = new CountryCodeTaggerResult(); CountryCodeTaggerResult alpha2Result = new CountryCodeTaggerResult();
String itemHandle = item.getHandle(); String itemHandle = item.getHandle();
List<MetadataValue> itemCountries = itemService.getMetadataByMetadataString(item, config.iso3166Field); List<MetadataValue> itemCountries =
itemService.getMetadataByMetadataString(item, config.iso3166Field);
// skip items that don't have country metadata // skip items that don't have country metadata
if (itemCountries.size() == 0) { if (itemCountries.size() == 0) {
@ -95,36 +96,60 @@ public class CountryCodeTagger extends AbstractCurationTask
} else { } else {
Gson gson = new Gson(); Gson gson = new Gson();
// TODO: convert to try: https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html // TODO: convert to try:
BufferedReader reader = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(config.isocodesJsonPath))); // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
ISO3166CountriesVocabulary isocodesCountriesJson = gson.fromJson(reader, ISO3166CountriesVocabulary.class); BufferedReader reader =
new BufferedReader(
new InputStreamReader(
this.getClass().getResourceAsStream(config.isocodesJsonPath)));
ISO3166CountriesVocabulary isocodesCountriesJson =
gson.fromJson(reader, ISO3166CountriesVocabulary.class);
reader.close(); reader.close();
reader = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(config.cgspaceCountriesJsonPath))); reader =
CGSpaceCountriesVocabulary cgspaceCountriesJson = gson.fromJson(reader, CGSpaceCountriesVocabulary.class); new BufferedReader(
new InputStreamReader(
this.getClass()
.getResourceAsStream(config.cgspaceCountriesJsonPath)));
CGSpaceCountriesVocabulary cgspaceCountriesJson =
gson.fromJson(reader, CGSpaceCountriesVocabulary.class);
reader.close(); reader.close();
// split the alpha2 country code field into schema, element, and qualifier so we can use it with item.addMetadata() // split the alpha2 country code field into schema, element, and qualifier so we can use
// it with item.addMetadata()
String[] iso3166Alpha2FieldParts = config.iso3166Alpha2Field.split("\\."); String[] iso3166Alpha2FieldParts = config.iso3166Alpha2Field.split("\\.");
if (config.forceupdate) { if (config.forceupdate) {
itemService.clearMetadata(Curator.curationContext(), item, iso3166Alpha2FieldParts[0], iso3166Alpha2FieldParts[1], iso3166Alpha2FieldParts[2], Item.ANY); itemService.clearMetadata(
Curator.curationContext(),
item,
iso3166Alpha2FieldParts[0],
iso3166Alpha2FieldParts[1],
iso3166Alpha2FieldParts[2],
Item.ANY);
} }
// check the item's country codes, if any // check the item's country codes, if any
List<MetadataValue> itemAlpha2CountryCodes = itemService.getMetadataByMetadataString(item, config.iso3166Alpha2Field); List<MetadataValue> itemAlpha2CountryCodes =
itemService.getMetadataByMetadataString(item, config.iso3166Alpha2Field);
if (itemAlpha2CountryCodes.size() == 0) { if (itemAlpha2CountryCodes.size() == 0) {
List<String> newAlpha2Codes = new ArrayList<String>(); List<String> newAlpha2Codes = new ArrayList<String>();
for (MetadataValue itemCountry : itemCountries) { for (MetadataValue itemCountry : itemCountries) {
//check ISO 3166-1 countries // check ISO 3166-1 countries
for (CountriesVocabulary.Country country : isocodesCountriesJson.countries) { for (CountriesVocabulary.Country country : isocodesCountriesJson.countries) {
if (itemCountry.getValue().equalsIgnoreCase(country.getName()) || itemCountry.getValue().equalsIgnoreCase(country.get_official_name()) || itemCountry.getValue().equalsIgnoreCase(country.get_common_name())) { if (itemCountry.getValue().equalsIgnoreCase(country.getName())
|| itemCountry
.getValue()
.equalsIgnoreCase(country.get_official_name())
|| itemCountry
.getValue()
.equalsIgnoreCase(country.get_common_name())) {
newAlpha2Codes.add(country.getAlpha_2()); newAlpha2Codes.add(country.getAlpha_2());
} }
} }
//check CGSpace countries // check CGSpace countries
for (CountriesVocabulary.Country country : cgspaceCountriesJson.countries) { for (CountriesVocabulary.Country country : cgspaceCountriesJson.countries) {
if (itemCountry.getValue().equalsIgnoreCase(country.getCgspace_name())) { if (itemCountry.getValue().equalsIgnoreCase(country.getCgspace_name())) {
newAlpha2Codes.add(country.getAlpha_2()); newAlpha2Codes.add(country.getAlpha_2());
@ -134,7 +159,14 @@ public class CountryCodeTagger extends AbstractCurationTask
if (newAlpha2Codes.size() > 0) { if (newAlpha2Codes.size() > 0) {
try { try {
itemService.addMetadata(Curator.curationContext(), item, iso3166Alpha2FieldParts[0], iso3166Alpha2FieldParts[1], iso3166Alpha2FieldParts[2], "en_US", newAlpha2Codes); itemService.addMetadata(
Curator.curationContext(),
item,
iso3166Alpha2FieldParts[0],
iso3166Alpha2FieldParts[1],
iso3166Alpha2FieldParts[2],
"en_US",
newAlpha2Codes);
itemService.update(Curator.curationContext(), item); itemService.update(Curator.curationContext(), item);
} catch (SQLException | AuthorizeException sqle) { } catch (SQLException | AuthorizeException sqle) {
config.log.debug(sqle.getMessage()); config.log.debug(sqle.getMessage());
@ -142,7 +174,11 @@ public class CountryCodeTagger extends AbstractCurationTask
alpha2Result.setStatus(Curator.CURATE_ERROR); alpha2Result.setStatus(Curator.CURATE_ERROR);
} }
alpha2Result.setResult(itemHandle + ": added " + newAlpha2Codes.size() + " alpha2 country code(s)"); alpha2Result.setResult(
itemHandle
+ ": added "
+ newAlpha2Codes.size()
+ " alpha2 country code(s)");
} else { } else {
alpha2Result.setResult(itemHandle + ": no matching countries found"); alpha2Result.setResult(itemHandle + ": no matching countries found");
} }

View File

@ -1,15 +1,18 @@
/* /*
* Copyright (C) 2020 Alan Orth * Copyright (C) 2020 Alan Orth
* *
* SPDX-License-Identifier: GPL-3.0-or-later * SPDX-License-Identifier: GPL-3.0-or-later
*/ */
package io.github.ilri.cgspace.ctasks; package io.github.ilri.cgspace.ctasks;
import com.google.gson.annotations.SerializedName; import com.google.gson.annotations.SerializedName;
import java.util.List; import java.util.List;
public class ISO3166CountriesVocabulary extends CountriesVocabulary { public class ISO3166CountriesVocabulary extends CountriesVocabulary {
// support reading iso_3166-1.json from Debian's iso-codes package using SerializedName since our class needs to match the JSON exactly // support reading iso_3166-1.json from Debian's iso-codes package using SerializedName since
@SerializedName("3166-1") List<Country> countries; // our class needs to match the JSON exactly
@SerializedName("3166-1")
List<Country> countries;
} }

View File

@ -1,21 +1,27 @@
/* /*
* Copyright (C) 2020 Alan Orth * Copyright (C) 2020 Alan Orth
* *
* SPDX-License-Identifier: GPL-3.0-or-later * SPDX-License-Identifier: GPL-3.0-or-later
*/ */
package io.github.ilri.cgspace.scripts; package io.github.ilri.cgspace.scripts;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.dspace.authorize.AuthorizeException; import org.dspace.authorize.AuthorizeException;
import org.dspace.content.*; import org.dspace.content.Bitstream;
import org.dspace.content.Bundle;
import org.dspace.content.Collection;
import org.dspace.content.Community;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.content.MetadataValue;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BundleService;
import org.dspace.content.service.ItemService;
import org.dspace.core.Constants; import org.dspace.core.Constants;
import org.dspace.core.Context; import org.dspace.core.Context;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.ItemService;
import org.dspace.handle.factory.HandleServiceFactory; import org.dspace.handle.factory.HandleServiceFactory;
import org.dspace.handle.service.HandleService; import org.dspace.handle.service.HandleService;
import org.dspace.content.service.BundleService;
import java.io.IOException; import java.io.IOException;
import java.sql.SQLException; import java.sql.SQLException;
@ -29,10 +35,12 @@ import java.util.List;
* @since 5.1 * @since 5.1
*/ */
public class FixJpgJpgThumbnails { public class FixJpgJpgThumbnails {
//note: static members belong to the class itself, not any one instance // note: static members belong to the class itself, not any one instance
public static ItemService itemService = ContentServiceFactory.getInstance().getItemService(); public static ItemService itemService = ContentServiceFactory.getInstance().getItemService();
public static HandleService handleService = HandleServiceFactory.getInstance().getHandleService(); public static HandleService handleService =
public static BundleService bundleService = ContentServiceFactory.getInstance().getBundleService(); HandleServiceFactory.getInstance().getHandleService();
public static BundleService bundleService =
ContentServiceFactory.getInstance().getBundleService();
public static void main(String[] args) { public static void main(String[] args) {
String parentHandle = null; String parentHandle = null;
@ -52,12 +60,16 @@ public class FixJpgJpgThumbnails {
if (parent != null) { if (parent != null) {
switch (parent.getType()) { switch (parent.getType()) {
case Constants.COLLECTION: case Constants.COLLECTION:
process(context, itemService.findByCollection(context, (Collection) parent)); process(
context,
itemService.findByCollection(context, (Collection) parent));
break; break;
case Constants.COMMUNITY: case Constants.COMMUNITY:
List<Collection> collections = ((Community) parent).getCollections(); List<Collection> collections = ((Community) parent).getCollections();
for (Collection collection : collections) { for (Collection collection : collections) {
process(context, itemService.findAllByCollection(context, collection)); process(
context,
itemService.findAllByCollection(context, collection));
} }
break; break;
case Constants.SITE: case Constants.SITE:
@ -79,7 +91,8 @@ public class FixJpgJpgThumbnails {
} }
} }
private static void process(Context context, Iterator<Item> items) throws SQLException, IOException, AuthorizeException { private static void process(Context context, Iterator<Item> items)
throws SQLException, IOException, AuthorizeException {
while (items.hasNext()) { while (items.hasNext()) {
Item item = items.next(); Item item = items.next();
processItem(context, item); processItem(context, item);
@ -87,14 +100,18 @@ public class FixJpgJpgThumbnails {
} }
} }
private static void processItem(Context context, Item item) throws SQLException, AuthorizeException, IOException { private static void processItem(Context context, Item item)
// Some bitstreams like Infographics are large JPGs and put in the ORIGINAL bundle on purpose so we shouldn't throws SQLException, AuthorizeException, IOException {
// Some bitstreams like Infographics and Maps are large JPEGs and put in the ORIGINAL bundle
// on purpose so we shouldn't
// swap them. // swap them.
List<MetadataValue> itemTypes = itemService.getMetadataByMetadataString(item, "dcterms.type"); List<MetadataValue> itemTypes =
boolean itemHasInfographic = false; itemService.getMetadataByMetadataString(item, "dcterms.type");
for (MetadataValue itemType: itemTypes) { for (MetadataValue itemType : itemTypes) {
if (itemType.getValue().equals("Infographic")) { if (itemType.getValue().equals("Infographic") || itemType.getValue().equals("Map")) {
itemHasInfographic = true; System.out.println(
item.getHandle() + ": item has an Infographic or Map, skipping.");
return;
} }
} }
@ -103,6 +120,12 @@ public class FixJpgJpgThumbnails {
List<Bitstream> thumbnailBundleBitstreams = thumbnailBundle.getBitstreams(); List<Bitstream> thumbnailBundleBitstreams = thumbnailBundle.getBitstreams();
for (Bitstream thumbnailBitstream : thumbnailBundleBitstreams) { for (Bitstream thumbnailBitstream : thumbnailBundleBitstreams) {
String thumbnailName = thumbnailBitstream.getName(); String thumbnailName = thumbnailBitstream.getName();
String thumbnailDescription = thumbnailBitstream.getDescription();
// There is no point continuing if the thumbnail's description is empty or null
if (StringUtils.isEmpty(thumbnailDescription)) {
continue;
}
if (thumbnailName.toLowerCase().contains(".jpg.jpg")) { if (thumbnailName.toLowerCase().contains(".jpg.jpg")) {
List<Bundle> originalBundles = item.getBundles("ORIGINAL"); List<Bundle> originalBundles = item.getBundles("ORIGINAL");
@ -117,24 +140,28 @@ public class FixJpgJpgThumbnails {
/* /*
- check if the original file name is the same as the thumbnail name minus the extra ".jpg" - check if the original file name is the same as the thumbnail name minus the extra ".jpg"
- check if the thumbnail description indicates it was automatically generated - check if the thumbnail description indicates it was automatically generated
- check if the item has dc.type Infographic (JPG could be the "real" item!)
- check if the original bitstream is less than ~100KiB - check if the original bitstream is less than ~100KiB
- Note: in my tests there were 4022 items with ".jpg.jpg" thumbnails totaling 394549249 - Note: in my tests there were 4022 items with ".jpg.jpg" thumbnails totaling 394549249
bytes for an average of about 98KiB so ~100KiB seems like a good cut off bytes for an average of about 98KiB so ~100KiB seems like a good cut off
*/ */
if ( if (originalName.equalsIgnoreCase(
originalName.equalsIgnoreCase(StringUtils.removeEndIgnoreCase(thumbnailName, ".jpg")) StringUtils.removeEndIgnoreCase(thumbnailName, ".jpg"))
&& ("Generated Thumbnail".equals(thumbnailBitstream.getDescription()) || "IM Thumbnail".equals(thumbnailBitstream.getDescription())) && ("Generated Thumbnail".equals(thumbnailDescription)
&& !itemHasInfographic || "IM Thumbnail".equals(thumbnailDescription))
&& originalBitstreamBytes < 100000 && originalBitstreamBytes < 100000) {
) { System.out.println(
System.out.println(item.getHandle() + ": replacing " + thumbnailName + " with " + originalName); item.getHandle()
+ ": replacing "
+ thumbnailName
+ " with "
+ originalName);
//add the original bitstream to the THUMBNAIL bundle // add the original bitstream to the THUMBNAIL bundle
bundleService.addBitstream(context, thumbnailBundle, originalBitstream); bundleService.addBitstream(
//remove the original bitstream from the ORIGINAL bundle context, thumbnailBundle, originalBitstream);
// remove the original bitstream from the ORIGINAL bundle
originalBundle.removeBitstream(originalBitstream); originalBundle.removeBitstream(originalBitstream);
//remove the JpgJpg bitstream from the THUMBNAIL bundle // remove the JpgJpg bitstream from the THUMBNAIL bundle
thumbnailBundle.removeBitstream(thumbnailBitstream); thumbnailBundle.removeBitstream(thumbnailBitstream);
} }
} }

View File

@ -1,14 +1,19 @@
/* /*
* Copyright (C) 2022 Alan Orth * Copyright (C) 2022 Alan Orth
* *
* SPDX-License-Identifier: GPL-3.0-or-later * SPDX-License-Identifier: GPL-3.0-or-later
*/ */
package io.github.ilri.cgspace.scripts; package io.github.ilri.cgspace.scripts;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.dspace.authorize.AuthorizeException; import org.dspace.authorize.AuthorizeException;
import org.dspace.content.*; import org.dspace.content.Bitstream;
import org.dspace.content.Bundle;
import org.dspace.content.Collection;
import org.dspace.content.Community;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory; import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BundleService; import org.dspace.content.service.BundleService;
import org.dspace.content.service.ItemService; import org.dspace.content.service.ItemService;