From 26d3cbd77807294dbd73e9779e114f2435c59c9b Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 7 Aug 2020 09:50:03 +0300 Subject: [PATCH] src/main/java: Tune FixJpgJpgThumbnails a bit Make sure we don't modify thumbnails if the item is an Infographic because the JPG in the ORIGINAL bundle might actually be the "real" file, in which case the THUMBNAIL bundle would have a legitimate ".jpg.jpg" file. Also, limit the criteria for replacement to original bitstreams that are less than 100KiB. In my tests I found that we had 4,022 items with ".jpg.jpg" thumbnails, and the average file size of the originals in those items was 98KiB. Without considering the large infographics, which are several megabytes apiece, the average of the remaining 3,765 originals was ~20KiB so 100KiB should be very safe. --- .../cgspace/scripts/FixJpgJpgThumbnails.java | 34 ++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/main/java/io/github/ilri/cgspace/scripts/FixJpgJpgThumbnails.java b/src/main/java/io/github/ilri/cgspace/scripts/FixJpgJpgThumbnails.java index 0d09f45..8bc43f5 100644 --- a/src/main/java/io/github/ilri/cgspace/scripts/FixJpgJpgThumbnails.java +++ b/src/main/java/io/github/ilri/cgspace/scripts/FixJpgJpgThumbnails.java @@ -13,8 +13,8 @@ import java.sql.SQLException; /** * @author Andrea Schweer schweer@waikato.ac.nz for the LCoNZ Institutional Research Repositories * @author Alan Orth for the International Livestock Research Institute - * @version 5.1-SNAPSHOT - * @since 5.1-SNAPSHOT + * @version 5.3 + * @since 5.1 */ public class FixJpgJpgThumbnails { @@ -73,6 +73,16 @@ public class FixJpgJpgThumbnails { } private static void processItem(Item item) throws SQLException, AuthorizeException, IOException { + // Some bitstreams like Infographics are large JPGs and put in the ORIGINAL bundle on purpose so we shouldn't + // swap them. 
+ Metadatum[] itemTypes = item.getMetadataByMetadataString("dc.type"); + Boolean itemHasInfographic = false; + for (Metadatum itemType: itemTypes) { + if (itemType.value.equals("Infographic")) { + itemHasInfographic = true; + } + } + Bundle[] thumbnailBundles = item.getBundles("THUMBNAIL"); for (Bundle thumbnailBundle : thumbnailBundles) { Bitstream[] thumbnailBundleBitstreams = thumbnailBundle.getBitstreams(); @@ -84,11 +94,25 @@ public class FixJpgJpgThumbnails { for (Bundle originalBundle : originalBundles) { Bitstream[] originalBundleBitstreams = originalBundle.getBitstreams(); - for(Bitstream originalBitstream : originalBundleBitstreams) { + for (Bitstream originalBitstream : originalBundleBitstreams) { String originalName = originalBitstream.getName(); - //check if the original file name is the same as the thumbnail name minus the extra ".jpg" - if (originalName.equalsIgnoreCase(StringUtils.removeEndIgnoreCase(thumbnailName, ".jpg")) && ("Generated Thumbnail".equals(thumbnailBitstream.getDescription()) || "IM Thumbnail".equals(thumbnailBitstream.getDescription()))) { + Long originalBitstreamBytes = originalBitstream.getSize(); + + /* + - check if the original file name is the same as the thumbnail name minus the extra ".jpg" + - check if the thumbnail description indicates it was automatically generated + - check if the item has dc.type Infographic (JPG could be the "real" item!) 
+ - check if the original bitstream is less than ~100KiB + - Note: in my tests there were 4022 items with ".jpg.jpg" thumbnails totaling 394549249 + bytes for an average of about 98KiB so ~100KiB seems like a good cut off + */ + if ( + originalName.equalsIgnoreCase(StringUtils.removeEndIgnoreCase(thumbnailName, ".jpg")) + && ("Generated Thumbnail".equals(thumbnailBitstream.getDescription()) || "IM Thumbnail".equals(thumbnailBitstream.getDescription())) + && !itemHasInfographic + && originalBitstreamBytes < 100000 + ) { System.out.println(item.getHandle() + ": replacing " + thumbnailName + " with " + originalName); //add the original bitstream to the THUMBNAIL bundle