Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ RUN apt-get update && apt-get --yes install apt-utils && apt-get --yes upgrade \
&& apt-get --yes install poppler-data poppler-utils \
&& apt-get --yes autoremove && apt-get --yes autoclean && apt-get --yes clean \
&& useradd --create-home --home-dir /srv/dlcs --shell /bin/bash --uid 1000 dlcs \
&& python -m pip install --upgrade pip
&& python -m pip install --upgrade pip \
&& python -m pip install --upgrade setuptools

# Copy nginx config and create appropriate folders
COPY --chown=dlcs:dlcs ./nginx.conf /etc/nginx/nginx.conf
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ The following list of environment variables are supported:
| `PDF_RASTERIZER_FALLBACK_DPI` | `200` | Engine | The DPI to use for images that exceed pdftoppm's memory size and produce a 1x1 pixel image (see https://github.com/Belval/pdf2image/issues/34) |
| `PDF_RASTERIZER_FORMAT` | `jpg` | Engine | The format to generate rasterized images in. Supported values are `ppm`, `jpeg` / `jpg`, `png` and `tiff` |
| `PDF_RASTERIZER_MAX_LENGTH` | `0` | Engine | Optional. The maximum length, in pixels, of the longest edge of a saved image. If a rasterized image exceeds this, it will be resized, maintaining aspect ratio. |
| `PDF_RASTERIZER_USE_CROPBOX` | `False` | Engine | If `True`, the PDF CropBox is used instead of the MediaBox. The MediaBox is the largest page box in a PDF; the other page boxes can equal the size of the MediaBox but cannot be larger. The CropBox defines the region to which the page contents are clipped. |
| `DLCS_API_ROOT` | `https://api.dlcs.digirati.io` | Engine | The root URI of the API of the target DLCS deployment, without the trailing slash. |
| `DLCS_S3_BUCKET_NAME` | `dlcs-composite-images` | Engine | The S3 bucket that the Composite Handler will push rasterized images to, for consumption by the wider DLCS. Both the Composite Handler and the DLCS must have access to this bucket. |
| `DLCS_S3_OBJECT_KEY_PREFIX` | `composites` | Engine | The S3 key prefix to use when pushing images to the `DLCS_S3_BUCKET_NAME` - in other words, the folder within the S3 bucket into which images are stored. |
Expand Down
3 changes: 3 additions & 0 deletions src/app/engine/rasterizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def __init__(self):
self._fmt = settings.PDF_RASTERIZER["format"]
self._thread_count = settings.PDF_RASTERIZER["thread_count"]
self._max_length = settings.PDF_RASTERIZER["max_length"]
self._use_cropbox = settings.PDF_RASTERIZER["use_cropbox"]

def rasterize_pdf(self, subfolder_path):
# Typically, pdf2image will write generated images to a temporary path, after
Expand Down Expand Up @@ -51,6 +52,7 @@ def __rasterize(
thread_count=self._thread_count,
output_file=output_file,
output_folder=subfolder_path,
use_cropbox=self._use_cropbox,
)

def __validate_rasterized_images(self, images, pdf_source, subfolder_path):
Expand Down Expand Up @@ -90,6 +92,7 @@ def __ensure_image_size(self, idx, im: Image):
logger.info(
f"resizing image index {idx} from {w},{h} to {scale_w},{scale_h}"
)

with im.resize((scale_w, scale_h), resample=Image.LANCZOS) as resized:
resized.save(filename)
return ResizeResult.RESIZED
Expand Down
1 change: 1 addition & 0 deletions src/app/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@
"dpi": env("PDF_RASTERIZER_DPI", cast=int, default=500),
"fallback_dpi": env("PDF_RASTERIZER_FALLBACK_DPI", cast=int, default=200),
"max_length": env("PDF_RASTERIZER_MAX_LENGTH", cast=int, default=0),
"use_cropbox": env("PDF_RASTERIZER_USE_CROPBOX", cast=bool, default=False),
}

ORIGIN_CONFIG = {"chunk_size": env("ORIGIN_CHUNK_SIZE", cast=int, default=8192)}
Expand Down