diff --git a/Dockerfile b/Dockerfile index 3a273ec..31b1b02 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,8 @@ RUN apt-get update && apt-get --yes install apt-utils && apt-get --yes upgrade \ && apt-get --yes install poppler-data poppler-utils \ && apt-get --yes autoremove && apt-get --yes autoclean && apt-get --yes clean \ && useradd --create-home --home-dir /srv/dlcs --shell /bin/bash --uid 1000 dlcs \ - && python -m pip install --upgrade pip + && python -m pip install --upgrade pip \ + && python -m pip install --upgrade setuptools # Copy nginx config and create appropriate folders COPY --chown=dlcs:dlcs ./nginx.conf /etc/nginx/nginx.conf diff --git a/README.md b/README.md index f635ad9..874284c 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ The following list of environment variables are supported: | `PDF_RASTERIZER_FALLBACK_DPI` | `200` | Engine | The DPI to use for images that exceed pdftoppm memory size and produce a 1x1 pixel (see https://github.com/Belval/pdf2image/issues/34) | | `PDF_RASTERIZER_FORMAT` | `jpg` | Engine | The format to generate rasterized images in. Supported values are `ppm`, `jpeg` / `jpg`, `png` and `tiff` | | `PDF_RASTERIZER_MAX_LENGTH` | `0` | Engine | Optional, the maximum size of pixels on longest edge that will be saved. If rasterized image exceeds this it will be resized, maintaining aspect ratio. | +| `PDF_RASTERIZER_USE_CROPBOX` | `False` | Engine | If `True` the PDF cropbox is used instead of mediabox. The MediaBox is the largest page box in a PDF. The other page boxes can equal the size of the MediaBox but they cannot be larger. The CropBox defines the region to which the page contents are to be clipped. | | `DLCS_API_ROOT` | `https://api.dlcs.digirati.io` | Engine | The root URI of the API of the target DLCS deployment, without the trailing slash. | | `DLCS_S3_BUCKET_NAME` | `dlcs-composite-images` | Engine | The S3 bucket that the Composite Handler will push rasterized images to, for consumption by the wider DLCS. Both the Composite Handler and the DLCS must have access to this bucket. | | `DLCS_S3_OBJECT_KEY_PREFIX` | `composites` | Engine | The S3 key prefix to use when pushing images to the `DLCS_S3_BUCKET_NAME` - in other words, the folder within the S3 bucket into which images are stored. | diff --git a/src/app/engine/rasterizers.py b/src/app/engine/rasterizers.py index f056a81..0c2aac9 100644 --- a/src/app/engine/rasterizers.py +++ b/src/app/engine/rasterizers.py @@ -24,6 +24,7 @@ def __init__(self): self._fmt = settings.PDF_RASTERIZER["format"] self._thread_count = settings.PDF_RASTERIZER["thread_count"] self._max_length = settings.PDF_RASTERIZER["max_length"] + self._use_cropbox = settings.PDF_RASTERIZER["use_cropbox"] def rasterize_pdf(self, subfolder_path): # Typically, pdf2image will write generated images to a temporary path, after @@ -51,6 +52,7 @@ def __rasterize( thread_count=self._thread_count, output_file=output_file, output_folder=subfolder_path, + use_cropbox=self._use_cropbox, ) def __validate_rasterized_images(self, images, pdf_source, subfolder_path): @@ -90,6 +92,7 @@ def __ensure_image_size(self, idx, im: Image): logger.info( f"resizing image index {idx} from {w},{h} to {scale_w},{scale_h}" ) + with im.resize((scale_w, scale_h), resample=Image.LANCZOS) as resized: resized.save(filename) return ResizeResult.RESIZED diff --git a/src/app/settings.py b/src/app/settings.py index 58f1c28..1ae1cb8 100644 --- a/src/app/settings.py +++ b/src/app/settings.py @@ -177,6 +177,7 @@ "dpi": env("PDF_RASTERIZER_DPI", cast=int, default=500), "fallback_dpi": env("PDF_RASTERIZER_FALLBACK_DPI", cast=int, default=200), "max_length": env("PDF_RASTERIZER_MAX_LENGTH", cast=int, default=0), + "use_cropbox": env("PDF_RASTERIZER_USE_CROPBOX", cast=bool, default=False), } ORIGIN_CONFIG = {"chunk_size": env("ORIGIN_CHUNK_SIZE", cast=int, default=8192)}