Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions Containerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,18 @@ ARG FLAVOR

FROM nvcr.io/nvidia/cuda:12.6.2-devel-ubi9 as gpu-base
ARG FLAVOR
RUN dnf install -y python3.11 python3.11-pip libcudnn8 libnccl git && \
dnf clean all
RUN dnf install -y python3.11 python3.11-pip libcudnn8 libnccl git
RUN ln -sf /usr/bin/python3.11 /usr/bin/python
ENV LD_LIBRARY_PATH=/usr/local/cuda-12.6/compat:$LD_LIBRARY_PATH

FROM ${FLAVOR}-base as road-core-rag-builder
ARG FLAVOR

USER 0
RUN dnf install -y rubygems && \
dnf clean all && \
gem install asciidoctor

WORKDIR /rag-content
ENV EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2

Expand Down
5 changes: 5 additions & 0 deletions src/lightspeed_rag_content/asciidoc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""A package that can be used to process AsciiDoc formatted files."""

from lightspeed_rag_content.asciidoc.asciidoctor_converter import AsciidoctorConverter

__all__ = ["AsciidoctorConverter"]
150 changes: 150 additions & 0 deletions src/lightspeed_rag_content/asciidoc/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# Copyright 2025 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""This module can be used to convert and investigate AsciiDoc files."""

import argparse
import logging
import shutil
import subprocess
import sys
from pathlib import Path

import yaml

from lightspeed_rag_content.asciidoc.asciidoctor_converter import (
RUBY_ASCIIDOC_DIR,
AsciidoctorConverter,
)

# Logger shared by the subcommand handlers in this entry-point module; it is
# named after the enclosing package.
LOG: logging.Logger = logging.getLogger(__package__)

# Configure the root logger so that INFO and above are emitted with a
# timestamp, the logger name and the level in front of every message.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


def main_convert(args: argparse.Namespace) -> None:
    """Process the ``convert`` subcommand.

    Build an AsciidoctorConverter from the parsed command line arguments and
    convert ``args.input_file`` into ``args.output_file``.

    Args:
        args:
            Parsed arguments providing ``target_format``, ``attributes_file``,
            ``converter_file``, ``input_file`` and ``output_file``.

    Raises:
        SystemExit:
            With the subprocess return code when the conversion command
            fails, or with 1 when a required file cannot be found or the
            attributes file cannot be parsed as YAML.
    """
    try:
        converter = AsciidoctorConverter(
            target_format=args.target_format,
            attributes_file=args.attributes_file,
            converter_file=args.converter_file,
        )
        converter.convert(args.input_file, args.output_file)
    except subprocess.CalledProcessError as e:
        # The conversion subprocess captures its output without text mode, so
        # stderr arrives as bytes. Decode it; otherwise the log would show
        # the b'...' repr with escaped newlines instead of the real message.
        stderr = e.stderr
        if isinstance(stderr, bytes):
            stderr = stderr.decode("utf-8", errors="replace")
        LOG.error(stderr)
        sys.exit(e.returncode)
    except (FileNotFoundError, yaml.YAMLError) as e:
        LOG.error(str(e))
        sys.exit(1)


def main_get_structure(args: argparse.Namespace) -> None:
    """Process the ``get_structure`` subcommand.

    Run the bundled ``asciidoc_structure_dumper.rb`` script with ruby on
    ``args.input_file``. The script's stdout and stderr are not captured, so
    whatever it prints goes straight to the caller's terminal.

    Args:
        args: Parsed arguments providing ``input_file``.

    Raises:
        SystemExit:
            With 1 when the ruby executable is missing or the script exits
            with a non-zero return code.
    """
    ruby_cmd = shutil.which("ruby")
    if not ruby_cmd:
        LOG.error("Missing ruby executable")
        sys.exit(1)

    dumper_script_path: Path = RUBY_ASCIIDOC_DIR.joinpath(
        "asciidoc_structure_dumper.rb"
    )

    try:
        subprocess.run(  # noqa: S603
            [
                ruby_cmd,
                str(dumper_script_path.absolute()),
                str(args.input_file.absolute()),
            ],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # Output is not captured here, so e.stderr is always None and logging
        # it would only print "None". Report which script failed and how.
        LOG.error(
            "%s exited with return code %d",
            dumper_script_path.name,
            e.returncode,
        )
        sys.exit(1)


def get_argument_parser() -> argparse.ArgumentParser:
    """Get ArgumentParser for the lightspeed_rag_content.asciidoc module.

    The parser exposes two mandatory subcommands, stored in ``command``:

    * ``convert`` - requires ``-i/--input-file`` and ``-o/--output-file`` and
      optionally accepts ``-a/--attributes-file``, ``-c/--converter-file``
      and ``-t/--target-format`` (defaults to ``text``).
    * ``get_structure`` - takes a single positional ``input_file``.

    All file arguments are parsed into ``pathlib.Path`` objects.

    Returns:
        A configured ArgumentParser instance.
    """
    parser = argparse.ArgumentParser(
        # Adjacent string literals are concatenated verbatim, so the space
        # after "different" must be written out explicitly.
        description="A command that can be used to convert AsciiDoc file to different "
        "formats (convert) and investigate AsciiDoc file structure (get_structure).",
        prog=__package__,
    )
    subparser = parser.add_subparsers(dest="command", required=True)

    convert_parser = subparser.add_parser(
        "convert",
        help="Convert AsciiDoc to text formatted file.",
    )
    convert_parser.add_argument(
        "-i",
        "--input-file",
        required=True,
        type=Path,
        help="AsciiDoc formatted file that should be converted to requested format.",
    )
    convert_parser.add_argument(
        "-o",
        "--output-file",
        required=True,
        type=Path,
        help="A path of where the converted file should be stored.",
    )
    convert_parser.add_argument(
        "-a",
        "--attributes-file",
        required=False,
        # Parsed as Path like every other file option of this subcommand.
        type=Path,
        help="A file containing attributes that should be passed to asciidoctor.",
    )
    convert_parser.add_argument(
        "-c",
        "--converter-file",
        required=False,
        type=Path,
        help="Asciidoctor compatible extension that should be used to convert the input file.",
    )
    convert_parser.add_argument(
        "-t",
        "--target-format",
        required=False,
        type=str,
        default="text",
        choices=["text", "html5", "xhtml5", "manpage"],
        help="Target format in which the input file should be saved.",
    )

    get_structure_parser = subparser.add_parser(
        "get_structure",
        help="Get structure of AsciiDoc formatted file.",
    )
    get_structure_parser.add_argument(
        "input_file",
        type=Path,
        help="AsciiDoc formatted file that should be investigated.",
    )

    return parser


if __name__ == "__main__":
    # Parse the command line once, then hand the result to the handler of
    # the selected subcommand; anything other than "convert" is treated as
    # a get_structure request.
    args = get_argument_parser().parse_args()
    handler = main_convert if args.command == "convert" else main_get_structure
    handler(args)
186 changes: 186 additions & 0 deletions src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
# Copyright 2025 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""This module contains AsciidoctorConverter that can be used to convert AsciiDoc files.

The code in this module is heavily dependent on ruby and asciidoctor. These commands
must be installed before using this module. Otherwise, monsters and dragons await you!

Typical usage example:

>>> adoc_converter = AsciidoctorConverter()
>>> adoc_converter.convert(Path("input.adoc"), Path("output.txt"))

An example of more involved usage:

>>> adoc_converter = AsciidoctorConverter(
... target_format='custom',
... attributes_file=Path('./attributes.yaml'),
... converter_file=Path('./asciidoc_custom_format_converter.rb'),
... )
>>> adoc_converter.convert(Path("input.adoc"), Path("output.custom"))

'attributes.yaml' content:

---
attribute_name_1: attribute_value_1
attribute_name_2: attribute_value_2
...

'asciidoc_custom_format_converter.rb' has to be compatible with asciidoctor.
Please read: https://docs.asciidoctor.org/asciidoctor/latest/extensions/
You can also investigate the default text converter 'asciidoc_text_converter.rb'
stored in the asciidoc package.
"""

import logging
import shutil
import subprocess
from importlib import resources
from pathlib import Path

import yaml

# Module-level logger named after this module.
LOG: logging.Logger = logging.getLogger(__name__)

# "ruby_asciidoc" directory located inside this package. The Ruby scripts used
# by this package (for example the default text converter) are looked up
# relative to it. The path is resolved once, when the module is imported.
RUBY_ASCIIDOC_DIR: Path = Path(resources.files(__package__)).joinpath("ruby_asciidoc")


class AsciidoctorConverter:
"""Convert AsciiDoc formatted documents to different formats.

The class requires asciidoctor to be installed. By default, all files are
converted to text format using a custom asciidoctor compatible extension
that is written in Ruby.
"""

def __init__(
self,
target_format: str = "text",
attributes_file: Path | None = None,
converter_file: Path | None = None,
):
"""Initialize AsciidoctorConverter.

Args:
target_format:
A format to which input files should be converted. These formats
are currently supported: text, html5, xhtml5, manpage.
attributes_file: A path pointing to an attributes file.
converter_file: An asciidoctor compatible extension.

Raises:
FileNotFoundError:
When asciidoctor executable or converter_file cannot be found.

yaml.YAMLError:
When attributes_file is not valid YAML file.
"""
self.target_format = target_format
self.attribute_list = self._get_attribute_list(attributes_file)

if not converter_file:
self.converter_file = self._get_converter_file(target_format)
else:
self.converter_file = converter_file

self.asciidoctor_cmd = self._get_asciidoctor_path()

@staticmethod
def _get_converter_file(target_format: str) -> Path | None:
"""Return converter file if target_format requires one."""
asciidoctor_supported_formats = ["html5", "xhtml5", "manpage"]
if target_format in asciidoctor_supported_formats:
return None

converter_files = {
"text": "asciidoc_text_converter.rb",
}

if not (converter_file := converter_files.get(target_format, None)):
raise FileNotFoundError(
f"There is no extension available for target format: {target_format}"
)

return RUBY_ASCIIDOC_DIR.joinpath(converter_file)

@staticmethod
def _get_asciidoctor_path() -> str:
"""Check whether asciidoctor and ruby are installed."""
asciidoctor_path = shutil.which("asciidoctor")
if not asciidoctor_path:
raise FileNotFoundError("asciidoctor executable not found")

LOG.info("Using asciidoctor with %s path", asciidoctor_path)
return asciidoctor_path

@staticmethod
def _get_attribute_list(attributes_file: Path | None) -> list:
"""Convert file containing attributes to list of '-a <key>=<value>'."""
attribute_list: list = []

if attributes_file is None:
return attribute_list

with open(attributes_file, "r") as file:
if (attributes := yaml.safe_load(file)) is None:
return attribute_list

for key, value in attributes.items():
attribute_list += ["-a", key + "=%s" % value]

return attribute_list

def convert(self, source_file: Path, destination_file: Path) -> None:
"""Convert AsciiDoc formatted file to target format.

Args:
source_file: A path of a file that should be converted.
destination_file:
A path of where the converted file should be stored. If
the directories in the path do not exist, they will be created

Raises:
subprocess.CalledSubprocessError:
If an error occurs when running asciidoctor.
"""
LOG.info("Processing: %s", str(source_file.absolute()))
if not destination_file.exists():
destination_file.parent.mkdir(parents=True, exist_ok=True)
else:
LOG.warning(
"Destination file %s exists. It will be overwritten!",
destination_file,
)

command = [self.asciidoctor_cmd]

if self.attribute_list:
command += self.attribute_list
if self.converter_file:
command += ["-r", str(self.converter_file.absolute())]

command = [
*command,
"-b",
self.target_format,
"-o",
str(destination_file.absolute()),
"--trace",
"--quiet",
str(source_file.absolute()),
]

subprocess.run(command, check=True, capture_output=True) # noqa: S603
Loading