src: add FixLowQualityThumbnails.java

This adds another script to detect and remove more low-quality thu-
mbnails. For example:

- If an item has an "IM Thumbnail" and a "Generated Thumbnail" in the
  THUMBNAIL bundle, remove the "Generated Thumbnail"
- If an item has a PDF bitstream and a JPEG bitstream with a name or
  description "thumbnail" in the ORIGINAL bundle, remove the
  "thumbnail" bitstream in the ORIGINAL bundle and try to remove the
  "thumbnail.jpg" bitstream in the THUMBNAIL bundle

The idea is that we should *always* prefer thumbnails generated by
ImageMagick from PDFs in the ORIGINAL bundle and should remove any
other manually uploaded thumbnails.
This commit is contained in:
Alan Orth 2022-10-05 15:07:56 +03:00
parent 781ddcd931
commit 6b648c2c85
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
1 changed files with 252 additions and 0 deletions

View File

@ -0,0 +1,252 @@
package io.github.ilri.cgspace.scripts;
import org.apache.commons.lang.StringUtils;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.*;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BundleService;
import org.dspace.content.service.ItemService;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.handle.factory.HandleServiceFactory;
import org.dspace.handle.service.HandleService;
import java.io.IOException;
import java.sql.SQLException;
import java.util.Iterator;
import java.util.List;
/**
* Fix low-quality thumbnails in a DSpace repository.
*
* <p>Search the DSpace repository for items containing bitstreams matching the following criteria:
*
* <ul>
* <li>If an item has an <code>IM Thumbnail</code> and a <code>Generated Thumbnail</code> in the
* <code>THUMBNAIL</code> bundle, remove the <code>Generated Thumbnail</code>.
* <li>If an item has a PDF bitstream and a JPEG bitstream with description "thumbnail" in the
* <code>ORIGINAL</code> bundle, remove the "thumbnail" bitstream in the ORIGINAL bundle.
* </ul>
*
* <p>The general idea is that we should always prefer thumbnails generated from PDFs by ImageMagick
* to manually uploaded JPEGs because ImageMagick Thumbnails can be regenerated with higher quality,
* resolution, etc. Furthermore, if there are JPEG bitstreams in the ORIGINAL bundle DSpace will
* automatically create ".jpg.jpg" thumbnails from them in the THUMBNAIL bundle so we should remove
* those as well!
*
* @author Andrea Schweer schweer@waikato.ac.nz for the LCoNZ Institutional Research Repositories
* @author Alan Orth for the International Livestock Research Institute
* @version 6.1
* @since 6.1
* @see FixJpgJpgThumbnails
*/
public class FixLowQualityThumbnails {
// note: static members belong to the class itself, not any one instance
public static ItemService itemService = ContentServiceFactory.getInstance().getItemService();
public static HandleService handleService =
HandleServiceFactory.getInstance().getHandleService();
public static BundleService bundleService =
ContentServiceFactory.getInstance().getBundleService();
public static void main(String[] args) {
String parentHandle = null;
if (args.length >= 1) {
parentHandle = args[0];
}
Context context = null;
try {
context = new Context();
context.turnOffAuthorisationSystem();
if (StringUtils.isBlank(parentHandle)) {
process(context, itemService.findAll(context));
} else {
DSpaceObject parent = handleService.resolveToObject(context, parentHandle);
if (parent != null) {
switch (parent.getType()) {
case Constants.COLLECTION:
process(
context,
itemService.findByCollection(context, (Collection) parent));
break;
case Constants.COMMUNITY:
List<Collection> collections = ((Community) parent).getCollections();
for (Collection collection : collections) {
process(
context,
itemService.findAllByCollection(context, collection));
}
break;
case Constants.SITE:
process(context, itemService.findAll(context));
break;
case Constants.ITEM:
processItem(context, (Item) parent);
context.commit();
break;
}
}
}
} catch (SQLException | AuthorizeException | IOException e) {
e.printStackTrace(System.err);
} finally {
if (context != null && context.isValid()) {
context.abort();
}
}
}
private static void process(Context context, Iterator<Item> items)
throws SQLException, IOException, AuthorizeException {
while (items.hasNext()) {
Item item = items.next();
processItem(context, item);
itemService.update(context, item);
}
}
private static void processItem(Context context, Item item)
throws SQLException, AuthorizeException, IOException {
// Set some state for the item before we iterate over the THUMBNAIL bundle
boolean itemHasImThumbnail = false;
// Iterate over the THUMBNAIL bundle to first identify if this item has an "IM Thumbnail"
List<Bundle> thumbnailBundles = item.getBundles("THUMBNAIL");
for (Bundle thumbnailBundle : thumbnailBundles) {
List<Bitstream> thumbnailBundleBitstreams = thumbnailBundle.getBitstreams();
for (Bitstream thumbnailBitstream : thumbnailBundleBitstreams) {
String thumbnailDescription = thumbnailBitstream.getDescription();
// Check if this item has a bitstream in the THUMBNAIL bundle with description "IM
// Thumbnail", but only if we haven't already seen one in another iteration for this
// bundle.
if (!itemHasImThumbnail && "IM Thumbnail".equals(thumbnailDescription)) {
itemHasImThumbnail = true;
}
}
// If this item has an IM Thumbnail we can be reasonably sure that there is a PDF
// in the ORIGINAL bundle and we don't need any other thumbnails.
if (itemHasImThumbnail) {
// Iterate over the bitstreams in the THUMBNAIL bundle again.
for (Bitstream thumbnailBitstream : thumbnailBundleBitstreams) {
String thumbnailName = thumbnailBitstream.getName();
String thumbnailDescription = thumbnailBitstream.getDescription();
// If this item has a "Generated Thumbnail" we can remove it, because those
// typically come from other JPEGs in the ORIGINAL bundle and we would prefer
// the IM Thumbnail generated from a PDF anyway. The DSpace-generated descri-
// ption will *always* be "Generated Thumbnail".
if ("Generated Thumbnail".equals(thumbnailDescription)) {
System.out.print("\u001b[33m");
System.out.println("Deleting (" + item.getHandle() + "):");
System.out.println("> Name: »" + thumbnailName + "«");
System.out.println("> Description: »" + thumbnailDescription + "«");
System.out.print("\u001b[0m");
// Remove the "Generated Thumbnail" bitstream from the THUMBNAIL bundle
thumbnailBundle.removeBitstream(thumbnailBitstream);
// If this item has a bitstream with the word "thumbnail" in it then we can
// remove it because we already know this item has an IM Thumbnail and we
// prefer that one.
} else if (thumbnailDescription.toLowerCase().contains("thumbnail")
&& !"IM Thumbnail".equals(thumbnailDescription)) {
System.out.print("\u001b[33m");
System.out.println("Deleting (" + item.getHandle() + "):");
System.out.println("> Name: »" + thumbnailName + "«");
System.out.println("> Description: »" + thumbnailDescription + "«");
System.out.print("\u001b[0m");
// Remove the "thumbnail" bitstream from the THUMBNAIL bundle
thumbnailBundle.removeBitstream(thumbnailBitstream);
// Otherwise skip it because it might be something uploaded manually, like
// a thumbnail for a journal or a limited access item.
} else {
System.out.print("\u001b[34m");
System.out.println("Skipping (" + item.getHandle() + "):");
System.out.println("> Name: »" + thumbnailName + "«");
System.out.println("> Description: »" + thumbnailDescription + "«");
System.out.print("\u001b[0m");
}
// Print a blank line
System.out.println();
}
}
}
// Set some state before we iterate over the ORIGINAL bundle
boolean itemHasOriginalPdfBitstream = false;
boolean itemHasOriginalJpegBitstream = false;
// Iterate over the ORIGINAL bundle to delete manually-uploaded JPEG
// bitstreams labeled "Thumbnail" whenever we have a PDF because they
// don't belong in the ORIGINAL bundle and DSpace will automatically
// create a better thumbnail from the PDF anyway.
List<Bundle> originalBundles = item.getBundles("ORIGINAL");
for (Bundle originalBundle : originalBundles) {
List<Bitstream> originalBundleBitstreams = originalBundle.getBitstreams();
for (Bitstream originalBitstream : originalBundleBitstreams) {
String originalFormat = originalBitstream.getFormat(context).getMIMEType();
// Check if this item has a PDF bitstream in the ORIGINAL bundle,
// but only if we haven't already seen one in another iteration
// for this bundle. DSpace will return "format application/pdf"
// for the MIME type.
if (!itemHasOriginalPdfBitstream && originalFormat.contains("application/pdf")) {
itemHasOriginalPdfBitstream = true;
}
// Check if this item has a JPEG bitstream in the ORIGINAL bundle,
// but only if we haven't already seen one in another iteration
// for this bundle. DSpace will return "format image/jpeg" for
// the MIME type.
if (!itemHasOriginalJpegBitstream && originalFormat.contains("image/jpeg")) {
itemHasOriginalJpegBitstream = true;
}
}
// Check if we found a PDF *and* a JPEG in this item's ORIGINAL
// bundle.
if (itemHasOriginalPdfBitstream && itemHasOriginalJpegBitstream) {
// Yes! Now iterate over the bitstreams in the ORIGINAL bundle
// again to see if the JPEG is a manually uploaded "Thumbnail"
for (Bitstream originalBitstream : originalBundleBitstreams) {
String originalName = originalBitstream.getName();
String originalDescription = originalBitstream.getDescription();
String originalFormat = originalBitstream.getFormat(context).getMIMEType();
/*
- check if the bitstream is a JPEG based on its MIME Type
- check if the bitstream's name or description is "Thumbnail"
*/
if (originalFormat.toLowerCase().contains("image/jpeg")
&& (originalName.toLowerCase().contains("thumbnail")
|| originalDescription.toLowerCase().contains("thumbnail"))) {
System.out.print("\u001b[33m");
System.out.println("Removing (" + item.getHandle() + "):");
System.out.println("> Name: »" + originalName + "«");
System.out.println("> Description: »" + originalDescription + "«");
System.out.print("\u001b[0m");
// Remove the original bitstream from the ORIGINAL bundle
originalBundle.removeBitstream(originalBitstream);
} else {
System.out.print("\u001b[34m");
System.out.println("Skipping (" + item.getHandle() + "):");
System.out.println("> Name: »" + originalName + "«");
System.out.println("> Description: »" + originalDescription + "«");
System.out.print("\u001b[0m");
}
// Print a blank line
System.out.println();
}
}
}
}
}