From 6b648c2c85fa2e526a1f4cbe5479aba122613a8b Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 5 Oct 2022 15:07:56 +0300 Subject: [PATCH] src: add FixLowQualityThumbnails.java This adds another script to detect and remove more low-quality thu- mbnails. For example: - If an item has an "IM Thumbnail" and a "Generated Thumbnail" in the THUMBNAIL bundle, remove the "Generated Thumbnail" - If an item has a PDF bitstream and a JPEG bitstream with a name or description "thumbnail" in the ORIGINAL bundle, remove the "thumbnail" bitstream in the ORIGINAL bundle and try to remove the "thumbnail.jpg" bitstream in the THUMBNAIL bundle The idea is that we should *always* prefer thumbnails generated by ImageMagick from PDFs in the ORIGINAL bundle and should remove any other manually uploaded thumbnails. --- .../scripts/FixLowQualityThumbnails.java | 252 ++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 src/main/java/io/github/ilri/cgspace/scripts/FixLowQualityThumbnails.java diff --git a/src/main/java/io/github/ilri/cgspace/scripts/FixLowQualityThumbnails.java b/src/main/java/io/github/ilri/cgspace/scripts/FixLowQualityThumbnails.java new file mode 100644 index 0000000..f46af73 --- /dev/null +++ b/src/main/java/io/github/ilri/cgspace/scripts/FixLowQualityThumbnails.java @@ -0,0 +1,252 @@ +package io.github.ilri.cgspace.scripts; + +import org.apache.commons.lang.StringUtils; +import org.dspace.authorize.AuthorizeException; +import org.dspace.content.*; +import org.dspace.content.factory.ContentServiceFactory; +import org.dspace.content.service.BundleService; +import org.dspace.content.service.ItemService; +import org.dspace.core.Constants; +import org.dspace.core.Context; +import org.dspace.handle.factory.HandleServiceFactory; +import org.dspace.handle.service.HandleService; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.Iterator; +import java.util.List; + +/** + * Fix low-quality thumbnails in a DSpace repository. + * + *

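<p>Search the DSpace repository for items containing bitstreams matching the following criteria:
+ *
+ * <ul>
+ *   <li>The item has both an "IM Thumbnail" and a "Generated Thumbnail" in the THUMBNAIL bundle:
+ *       remove the "Generated Thumbnail".
+ *   <li>The item has a PDF bitstream and a JPEG bitstream whose name or description contains
+ *       "thumbnail" in the ORIGINAL bundle: remove that JPEG bitstream from the ORIGINAL bundle.
+ * </ul>
+ *
+ *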

<p>The general idea is that we should always prefer thumbnails generated from PDFs by ImageMagick
+ * to manually uploaded JPEGs because ImageMagick Thumbnails can be regenerated with higher quality,
+ * resolution, etc. Furthermore, if there are JPEG bitstreams in the ORIGINAL bundle DSpace will
+ * automatically create ".jpg.jpg" thumbnails from them in the THUMBNAIL bundle so we should remove
+ * those as well!
+ *
+ * @author Andrea Schweer schweer@waikato.ac.nz for the LCoNZ Institutional Research Repositories
+ * @author Alan Orth for the International Livestock Research Institute
+ * @version 6.1
+ * @since 6.1
+ * @see FixJpgJpgThumbnails
+ */
+public class FixLowQualityThumbnails {
+    // note: static members belong to the class itself, not any one instance
+    public static ItemService itemService = ContentServiceFactory.getInstance().getItemService();
+    public static HandleService handleService =
+            HandleServiceFactory.getInstance().getHandleService();
+    public static BundleService bundleService =
+            ContentServiceFactory.getInstance().getBundleService();
+
+    // ANSI colour escape sequences used for the console report
+    private static final String ANSI_YELLOW = "\u001b[33m";
+    private static final String ANSI_BLUE = "\u001b[34m";
+    private static final String ANSI_RESET = "\u001b[0m";
+
+    // Description carried by the thumbnails we always keep
+    private static final String IM_THUMBNAIL = "IM Thumbnail";
+
+    /**
+     * Command-line entry point.
+     *
+     * <p>With no argument (or a blank one) every item returned by {@code itemService.findAll} is
+     * processed. Otherwise the first argument is resolved as a handle and processing is limited
+     * to that site, community, collection or item.
+     *
+     * @param args optional; {@code args[0]} is the handle of the object to process
+     */
+    public static void main(String[] args) {
+        String parentHandle = null;
+        if (args.length >= 1) {
+            parentHandle = args[0];
+        }
+
+        Context context = null;
+        try {
+            context = new Context();
+            context.turnOffAuthorisationSystem();
+
+            if (StringUtils.isBlank(parentHandle)) {
+                process(context, itemService.findAll(context));
+            } else {
+                DSpaceObject parent = handleService.resolveToObject(context, parentHandle);
+                if (parent == null) {
+                    // Previously a bad handle made the script exit silently
+                    System.err.println("Could not resolve handle: " + parentHandle);
+                } else {
+                    switch (parent.getType()) {
+                        case Constants.COLLECTION:
+                            process(
+                                    context,
+                                    itemService.findByCollection(context, (Collection) parent));
+                            break;
+                        case Constants.COMMUNITY:
+                            List<Collection> collections = ((Community) parent).getCollections();
+                            for (Collection collection : collections) {
+                                process(
+                                        context,
+                                        itemService.findAllByCollection(context, collection));
+                            }
+                            break;
+                        case Constants.SITE:
+                            process(context, itemService.findAll(context));
+                            break;
+                        case Constants.ITEM:
+                            processItem(context, (Item) parent);
+                            context.commit();
+                            break;
+                        default:
+                            // Any other object type has no items to process
+                            break;
+                    }
+                }
+            }
+
+            // Commit and close the context. Without this only the ITEM branch ever committed:
+            // every other branch left the context valid, so the finally block aborted it, and
+            // per the DSpace Context API abort() closes the context without committing.
+            context.complete();
+        } catch (SQLException | AuthorizeException | IOException e) {
+            e.printStackTrace(System.err);
+        } finally {
+            // Only reached with a valid context when something above failed
+            if (context != null && context.isValid()) {
+                context.abort();
+            }
+        }
+    }
+
+    /**
+     * Runs {@link #processItem} on every item of the iterator and then calls
+     * {@code itemService.update} on it.
+     *
+     * @param context the DSpace context to work in
+     * @param items the items to process
+     * @throws SQLException propagated from the DSpace services
+     * @throws IOException propagated from the DSpace services
+     * @throws AuthorizeException propagated from the DSpace services
+     */
+    private static void process(Context context, Iterator<Item> items)
+            throws SQLException, IOException, AuthorizeException {
+        while (items.hasNext()) {
+            Item item = items.next();
+            processItem(context, item);
+            itemService.update(context, item);
+        }
+    }
+
+    /**
+     * Removes low-quality thumbnails from one item: first from its THUMBNAIL bundles, then
+     * manually uploaded "thumbnail" JPEGs from its ORIGINAL bundles.
+     *
+     * @param context the DSpace context, used to look up bitstream formats
+     * @param item the item to clean up
+     * @throws SQLException if a bitstream format lookup fails
+     * @throws AuthorizeException kept in the signature for callers; not thrown by this code
+     * @throws IOException kept in the signature for callers; not thrown by this code
+     */
+    private static void processItem(Context context, Item item)
+            throws SQLException, AuthorizeException, IOException {
+        pruneThumbnailBundles(item);
+        pruneOriginalBundles(context, item);
+    }
+
+    /**
+     * If the item has an "IM Thumbnail", removes every other bitstream in the THUMBNAIL bundle
+     * whose description contains "thumbnail" (case-insensitively).
+     */
+    private static void pruneThumbnailBundles(Item item) {
+        // Item-level state: once an IM Thumbnail was seen it stays set for later bundles too
+        boolean itemHasImThumbnail = false;
+
+        List<Bundle> thumbnailBundles = item.getBundles("THUMBNAIL");
+        for (Bundle thumbnailBundle : thumbnailBundles) {
+            List<Bitstream> thumbnailBitstreams = thumbnailBundle.getBitstreams();
+
+            // First pass: does this item have a bitstream described as "IM Thumbnail"?
+            for (Bitstream thumbnailBitstream : thumbnailBitstreams) {
+                if (IM_THUMBNAIL.equals(thumbnailBitstream.getDescription())) {
+                    itemHasImThumbnail = true;
+                    break;
+                }
+            }
+
+            // Without an IM Thumbnail we leave the bundle alone.
+            if (!itemHasImThumbnail) {
+                continue;
+            }
+
+            // Second pass: with an IM Thumbnail present we can be reasonably sure there is a
+            // PDF in the ORIGINAL bundle, so no other thumbnail is needed.
+            for (Bitstream thumbnailBitstream : thumbnailBitstreams) {
+                String thumbnailName = thumbnailBitstream.getName();
+                String thumbnailDescription = thumbnailBitstream.getDescription();
+
+                // Matches "Generated Thumbnail" as well as any other description mentioning
+                // "thumbnail", except the IM Thumbnail itself. containsIgnoreCase is null-safe,
+                // so a bitstream without a description is skipped instead of causing an NPE.
+                boolean redundant =
+                        StringUtils.containsIgnoreCase(thumbnailDescription, "thumbnail")
+                                && !IM_THUMBNAIL.equals(thumbnailDescription);
+
+                if (redundant) {
+                    report(ANSI_YELLOW, "Deleting", item, thumbnailName, thumbnailDescription);
+                    thumbnailBundle.removeBitstream(thumbnailBitstream);
+                } else {
+                    // Might be something uploaded manually, like a thumbnail for a journal or
+                    // a limited access item.
+                    report(ANSI_BLUE, "Skipping", item, thumbnailName, thumbnailDescription);
+                }
+            }
+        }
+    }
+
+    /**
+     * If the item's ORIGINAL bundle holds both a PDF and a JPEG, removes every JPEG whose name
+     * or description contains "thumbnail" (case-insensitively), because such files do not
+     * belong in the ORIGINAL bundle when a thumbnail can be generated from the PDF.
+     *
+     * @throws SQLException if a bitstream format lookup fails
+     */
+    private static void pruneOriginalBundles(Context context, Item item) throws SQLException {
+        // Item-level state, carried across bundles
+        boolean itemHasOriginalPdfBitstream = false;
+        boolean itemHasOriginalJpegBitstream = false;
+
+        List<Bundle> originalBundles = item.getBundles("ORIGINAL");
+        for (Bundle originalBundle : originalBundles) {
+            List<Bitstream> originalBitstreams = originalBundle.getBitstreams();
+
+            // First pass: record which formats are present
+            for (Bitstream originalBitstream : originalBitstreams) {
+                String mimeType = mimeTypeOf(context, originalBitstream);
+                if (StringUtils.containsIgnoreCase(mimeType, "application/pdf")) {
+                    itemHasOriginalPdfBitstream = true;
+                }
+                if (StringUtils.containsIgnoreCase(mimeType, "image/jpeg")) {
+                    itemHasOriginalJpegBitstream = true;
+                }
+            }
+
+            // Only act when we found a PDF *and* a JPEG
+            if (!(itemHasOriginalPdfBitstream && itemHasOriginalJpegBitstream)) {
+                continue;
+            }
+
+            // Second pass: is the JPEG a manually uploaded "Thumbnail"?
+            for (Bitstream originalBitstream : originalBitstreams) {
+                String originalName = originalBitstream.getName();
+                String originalDescription = originalBitstream.getDescription();
+                String mimeType = mimeTypeOf(context, originalBitstream);
+
+                // Null-safe: a missing name or description simply does not match
+                boolean manualThumbnail =
+                        StringUtils.containsIgnoreCase(mimeType, "image/jpeg")
+                                && (StringUtils.containsIgnoreCase(originalName, "thumbnail")
+                                        || StringUtils.containsIgnoreCase(
+                                                originalDescription, "thumbnail"));
+
+                if (manualThumbnail) {
+                    report(ANSI_YELLOW, "Removing", item, originalName, originalDescription);
+                    originalBundle.removeBitstream(originalBitstream);
+                } else {
+                    report(ANSI_BLUE, "Skipping", item, originalName, originalDescription);
+                }
+            }
+        }
+    }
+
+    /** Returns the bitstream's MIME type, or null when no format is available. */
+    private static String mimeTypeOf(Context context, Bitstream bitstream) throws SQLException {
+        BitstreamFormat format = bitstream.getFormat(context);
+        return (format == null) ? null : format.getMIMEType();
+    }
+
+    /** Prints one coloured report entry for a bitstream, followed by a blank line. */
+    private static void report(
+            String colour, String action, Item item, String name, String description) {
+        System.out.print(colour);
+        System.out.println(action + " (" + item.getHandle() + "):");
+        System.out.println("> Name: »" + name + "«");
+        System.out.println("> Description: »" + description + "«");
+        System.out.print(ANSI_RESET);
+
+        // Print a blank line
+        System.out.println();
+    }
+}