From 83d674a742d55d4e07c3cc8467f49b75676560bc Mon Sep 17 00:00:00 2001 From: Lukas Piwowarski Date: Mon, 17 Mar 2025 18:31:31 +0200 Subject: [PATCH 1/5] Add lightspeed_rag_content.asciidoc subpackage This commit adds the lightspeed_rag_content.asciidoc package. The purpose of this package is to: 1. Provide an interface for easy conversion of AsciiDoc formatted files, mainly to text format. The AsciidoctorConverter class can be used to convert AsciiDoc files. On the backend, the class uses the asciidoctor tool [1]. This makes the package dependent on this tool and ruby. The main reason for picking this tool is that as of now there is no easy way to convert AsciiDoc formatted files to text format using pure Python and as we already have an extension written for asciidoctor, we can reuse it. This commit does not rule out the possibility of introducing a new converter later with a more suitable backend based on pure Python. One can convert the .adoc file either by using the AsciidoctorConverter class or by using the lightspeed_rag_content.asciidoc module as follows: python -m lightspeed_rag_content.asciidoc convert \ -i input_file.adoc -o output_file.txt 2. Allow investigation of the structure of AsciiDoc formatted files. The introduced package wraps an already existing ruby script that dumps the structure of an adoc file. This comes in handy when writing a custom ruby extension for asciidoctor. 
The script can be used as follows: python -m lightspeed_rag_content.asciidoc get_structure input.adoc [1] https://asciidoctor.org/ Signed-off-by: Lukas Piwowarski --- .../asciidoc/__init__.py | 5 + .../asciidoc/__main__.py | 149 +++++++++++++++ .../asciidoc/asciidoctor_converter.py | 179 ++++++++++++++++++ .../asciidoc_structure_dumper.rb | 64 +++++++ .../ruby_asciidoc/asciidoc_text_converter.rb | 159 ++++++++++++++++ 5 files changed, 556 insertions(+) create mode 100644 src/lightspeed_rag_content/asciidoc/__init__.py create mode 100644 src/lightspeed_rag_content/asciidoc/__main__.py create mode 100644 src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py create mode 100644 src/lightspeed_rag_content/asciidoc/ruby_asciidoc/asciidoc_structure_dumper.rb create mode 100644 src/lightspeed_rag_content/asciidoc/ruby_asciidoc/asciidoc_text_converter.rb diff --git a/src/lightspeed_rag_content/asciidoc/__init__.py b/src/lightspeed_rag_content/asciidoc/__init__.py new file mode 100644 index 00000000..411b9b0a --- /dev/null +++ b/src/lightspeed_rag_content/asciidoc/__init__.py @@ -0,0 +1,5 @@ +"""A package that can be used to process AsciiDoc formatted files.""" + +from lightspeed_rag_content.asciidoc.asciidoctor_converter import AsciidoctorConverter + +__all__ = ["AsciidoctorConverter"] diff --git a/src/lightspeed_rag_content/asciidoc/__main__.py b/src/lightspeed_rag_content/asciidoc/__main__.py new file mode 100644 index 00000000..76957467 --- /dev/null +++ b/src/lightspeed_rag_content/asciidoc/__main__.py @@ -0,0 +1,149 @@ +# Copyright 2025 Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +"""This module can be used to convert and investigate AsciiDoc files.""" + +import argparse +import logging +import shutil +import subprocess +import sys +from pathlib import Path + +import yaml + +from lightspeed_rag_content.asciidoc.asciidoctor_converter import ( + RUBY_ASCIIDOC_DIR, + AsciidoctorConverter, +) + +LOG: logging.Logger = logging.getLogger(__package__) +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) + + +def main_convert(args: argparse.Namespace) -> None: + """Process convert subcommand.""" + try: + converter = AsciidoctorConverter( + target_format=args.target_format, + attributes_file=args.attributes_file, + converter_file=args.converter_file, + ) + converter.convert(args.input_file, args.output_file) + except subprocess.CalledProcessError as e: + LOG.error(e.stderr) + sys.exit(e.returncode) + except (FileNotFoundError, yaml.YAMLError) as e: + LOG.error(str(e)) + sys.exit(1) + + +def main_get_structure(args: argparse.Namespace) -> None: + """Process get_structure subcommand.""" + ruby_cmd = shutil.which("ruby") + if not ruby_cmd: + LOG.error("Missing ruby executable") + sys.exit(1) + + try: + dumper_script_path: Path = RUBY_ASCIIDOC_DIR.joinpath( + "asciidoc_structure_dumper.rb" + ) + subprocess.run( # noqa: S603 + [ + ruby_cmd, + str(dumper_script_path.absolute()), + str(args.input_file.absolute()), + ], + check=True, + ) + except subprocess.CalledProcessError as e: + LOG.error(e.stderr) + sys.exit(1) + + +def get_argument_parser() -> argparse.ArgumentParser: + """Get 
ArgumentParser for ligthspeed_rag_content.asciidoc module.""" + parser = argparse.ArgumentParser( + description="A command that can be used to convert AsciiDoc file to different" + "formats (convert) and investigate AsciiDoc file structure (get_structure).", + prog=__package__, + ) + subparser = parser.add_subparsers(dest="command", required=True) + + convert_parser = subparser.add_parser( + "convert", + help="Convert AsciiDoc to text formatted file.", + ) + convert_parser.add_argument( + "-i", + "--input-file", + required=True, + type=Path, + help="AsciiDoc formatted file that should be converted to requested format.", + ) + convert_parser.add_argument( + "-o", + "--output-file", + required=True, + type=Path, + help="A path of where the converted file should be stored.", + ) + convert_parser.add_argument( + "-a", + "--attributes-file", + required=False, + type=str, + help="A file containing attributes that should be passed to asciidoctor.", + ) + convert_parser.add_argument( + "-c", + "--converter-file", + required=False, + type=Path, + help="Asciidoctor compatible extension that should be used to convert the input file.", + ) + convert_parser.add_argument( + "-t", + "--target-format", + required=False, + type=str, + default="text", + help="Target format in which the input file should be saved.", + ) + + get_structure_parser = subparser.add_parser( + "get_structure", + help="Get structure of AsciiDoc formatted file.", + ) + get_structure_parser.add_argument( + "input_file", + type=Path, + help="AsciiDoc formatted file that should be investigated.", + ) + + return parser + + +if __name__ == "__main__": + parser = get_argument_parser() + args = parser.parse_args() + + if args.command == "convert": + main_convert(args) + else: + main_get_structure(args) diff --git a/src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py b/src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py new file mode 100644 index 00000000..8d4650bc --- /dev/null +++ 
b/src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py @@ -0,0 +1,179 @@ +# Copyright 2025 Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""This module contains AsciidoctorConverter that can be used to convert AsciiDoc files. + +The code in this module is heavily dependent on ruby and asciidoctor. These commands +must be installed before using this module. Otherwise, monsters and dragons await you! + +Typical usage example: + + >>> adoc_converter = AsciidoctorConverter() + >>> adoc_converter.convert(Path("input.adoc"), Path("output.txt")) + +An example of more involved usage: + + >>> adoc_converter = AsciidoctorConverter( + ... target_format='custom', + ... attributes_file=Path('./attributes.yaml'), + ... converter_file=Path('./asciidoc_custom_format_converter.rb'), + ... ) + >>> adoc_converter.convert(Path("input.adoc"), Path("output.custom")) + +'attributes.yaml' content: + + --- + attribute_name_1: attribute_value_1 + attribute_name_2: attribute_value_2 + ... + +'asciidoc_custom_format_converter.rb' has to be compatible with asciidoctor. +Please read: https://docs.asciidoctor.org/asciidoctor/latest/extensions/ +You can also investigate the default text converter 'asciidoc_text_converter.rb' +stored in the asciidoc package. 
+""" + +import logging +import shutil +import subprocess +from importlib import resources +from pathlib import Path + +import yaml + +LOG: logging.Logger = logging.getLogger(__name__) + +RUBY_ASCIIDOC_DIR: Path = Path(resources.files(__package__)).joinpath("ruby_asciidoc") + + +class AsciidoctorConverter: + """Convert AsciiDoc formatted documents to different formats. + + The class requires asciidoctor to be installed. By default, all files are + converted to text format using a custom asciidoctor compatible extension + that is written in Ruby. + """ + + def __init__( + self, + target_format: str = "text", + attributes_file: Path | None = None, + converter_file: Path | None = None, + ): + """Initialize AsciidoctorConverter. + + Args: + target_format: A format to which input files should be converted. + attributes_file: A path pointing to an attributes file. + converter_file: An asciidoctor compatible extension. + + Raises: + FileNotFoundError: + When asciidoctor executable or converter_file cannot be found. + + yaml.YAMLError: + When attributes_file is not valid YAML file. 
+ """ + self.target_format = target_format + self.attribute_list = self._get_attribute_list(attributes_file) + + if converter_file: + self.converter_file = converter_file + else: + self.converter_file = self._get_default_converter_file(target_format) + + self.asciidoctor_cmd = self._get_asciidoctor_path() + + @staticmethod + def _get_default_converter_file(target_format: str) -> Path: + """Return path to asciidoctor Ruby based extension.""" + converter_files = { + "text": "asciidoc_text_converter.rb", + } + + if not (converter_file := converter_files.get(target_format, None)): + raise FileNotFoundError( + f"There is no built-in extension for target format: {target_format}" + ) + + return RUBY_ASCIIDOC_DIR.joinpath(converter_file) + + @staticmethod + def _get_asciidoctor_path() -> str: + """Check whether asciidoctor and ruby are installed.""" + asciidoctor_path = shutil.which("asciidoctor") + if not asciidoctor_path: + raise FileNotFoundError("asciidoctor executable not found") + + LOG.info(f"Using asciidoctor with {asciidoctor_path} path") + return asciidoctor_path + + @staticmethod + def _get_attribute_list(attributes_file: Path | None) -> list: + """Convert file containing attributes to list of '-a ='.""" + attribute_list: list = [] + + if attributes_file is None: + return attribute_list + + with open(attributes_file, "r") as file: + if (attributes := yaml.safe_load(file)) is None: + return attribute_list + + for key, value in attributes.items(): + attribute_list += ["-a", key + "=%s" % value] + + return attribute_list + + def convert(self, source_file: Path, destination_file: Path) -> None: + """Convert AsciiDoc formatted file to target format. + + Args: + source_file: A path of a file that should be converted. + destination_file: + A path of where the converted file should be stored. If + the directories in the path do not exist, they will be created + + Raises: + subprocess.CalledSubprocessError: + If an error occurs when running asciidoctor. 
+ """ + LOG.info("Processing: " + str(source_file.absolute())) + if not destination_file.exists(): + destination_file.parent.mkdir(parents=True, exist_ok=True) + else: + LOG.warning( + f"Destination file {destination_file} exists. It will be overwritten!" + ) + + command = [self.asciidoctor_cmd] + + if self.attribute_list: + command += self.attribute_list + if self.converter_file: + command += ["-r", str(self.converter_file.absolute())] + + command = [ + *command, + "-b", + self.target_format, + "-o", + str(destination_file.absolute()), + "--trace", + "--quiet", + str(source_file.absolute()), + ] + + subprocess.run(command, check=True, capture_output=True) # noqa: S603 diff --git a/src/lightspeed_rag_content/asciidoc/ruby_asciidoc/asciidoc_structure_dumper.rb b/src/lightspeed_rag_content/asciidoc/ruby_asciidoc/asciidoc_structure_dumper.rb new file mode 100644 index 00000000..2960bb45 --- /dev/null +++ b/src/lightspeed_rag_content/asciidoc/ruby_asciidoc/asciidoc_structure_dumper.rb @@ -0,0 +1,64 @@ +# asciidoc structure dumper script +# + +require 'asciidoctor' + +# Function to recursively dump object structures +def dump_structure(node, indent = 0) + indent_str = ' ' * indent + node_info = "#{indent_str}#<#{node.class.name}" + + # Collecting attributes + attributes = {} + if node.respond_to?(:context) + attributes[:context] = node.context + end + if node.respond_to?(:title) + attributes[:title] = node.title + end + if node.respond_to?(:level) + attributes[:level] = node.level + end + if node.respond_to?(:text) + attributes[:text] = node.text + end + if node.respond_to?(:blocks) + attributes[:blocks] = node.blocks.size + end + + # Adding attributes to the node info + unless attributes.empty? + attributes_str = attributes.map { |key, value| "#{key}: #{value.inspect}" }.join(', ') + node_info += " {#{attributes_str}}" + end + node_info += '>' + + puts node_info + + # Recursively process child blocks + if node.respond_to?(:blocks) && node.blocks.any? 
+ node.blocks.each { |child| dump_structure(child, indent + 2) } + end +end + +# Load and parse the AsciiDoc file +def load_and_dump_asciidoc(file_path) + # Read the file content + asciidoc_content = File.read(file_path) + + # Parse the AsciiDoc content + doc = Asciidoctor.load(asciidoc_content) + + # Dump the structure of the document + dump_structure(doc) +end + +# Check if the script is run with a file path argument +if ARGV.size != 1 + puts "Usage: ruby asciidoc_structure_dumper.rb " + exit 1 +end + +# Load and process the AsciiDoc file +file_path = ARGV[0] +load_and_dump_asciidoc(file_path) diff --git a/src/lightspeed_rag_content/asciidoc/ruby_asciidoc/asciidoc_text_converter.rb b/src/lightspeed_rag_content/asciidoc/ruby_asciidoc/asciidoc_text_converter.rb new file mode 100644 index 00000000..60066fc9 --- /dev/null +++ b/src/lightspeed_rag_content/asciidoc/ruby_asciidoc/asciidoc_text_converter.rb @@ -0,0 +1,159 @@ +# adoc to plaintext converter plugin for asciidoctor +# + +module Asciidoctor + class Converter::TextConverter < Converter::Base + register_for 'text' + + def convert(node, transform = node.node_name, opts = nil) + case transform + when 'document' + convert_document(node) + when 'section' + convert_section(node) + when 'paragraph' + node.content + when 'ulist' + convert_ulist(node) + when 'olist' + convert_olist(node) + when 'dlist' + convert_dlist(node) + when 'list_item' + convert_list_item(node) + when 'image' + "![#{node.attr('alt')}]" + when 'literal' + decode(node.content) + when 'quote' + "> #{node.content}" + when 'verse' + "```\n#{node.content}\n```" + when 'floating_title' + convert_floating_title(node) + when 'admonition' + convert_admonition(node) + when 'listing' + convert_listing(node) + else + handle_unknown_node(node) + end + end + + private + + def convert_document(node) + result = [] + result << "# " + decode(node.doctitle) if node.header? 
+ result << node.blocks.map { |child| convert(child) }.join("\n\n") + result.join("\n\n") + end + + def convert_section(node) + result = [] + result << "#{('#' * node.level)} " + decode(node.title) + result << node.blocks.map { |child| convert(child) }.join("\n\n") + result.join("\n\n") + end + + def convert_ulist(node) + node.items.map { |item| "* #{convert(item)}" }.join("\n") + end + + def convert_olist(node) + node.items.each_with_index.map { |item, index| "#{index + 1}. #{convert(item)}" }.join("\n") + end + + def convert_dlist(node) + node.items.map { |(terms, descs)| + convert_dlist_terms_and_descs(terms, descs) + }.join("\n") + end + + def convert_dlist_terms_and_descs(terms, descs) + terms_text = terms.map { |term| decode(term.text) }.join(", ") + descs_text = (convert(descs) if not descs.nil?) || "" + "#{terms_text}:: #{descs_text}" + end + + def convert_list_item(node) + content = (decode(node.text) if not node.text.nil?) || "" + if node.blocks? + node.blocks.each do |block| + if block.title? 
+ content += "\n" + decode(block.title) + end + if block.context == :paragraph + content += "\n\n#{convert(block)}" + elsif block.context == :literal + content += "\n\n#{convert(block)}" + else + content += "\n#{convert(block)}" + end + end + end + content + end + + def convert_floating_title(node) + "## " + decode(node.title) + end + + def convert_admonition(node) + type = node.attr('name').upcase + content = node.content + decode("\n[#{type}]\n----\n#{content}\n----\n") + end + + def convert_listing(node) + language = node.attr('language') + content = node.content + ret = "\n```" + if language + ret += decode(language) + end + ret += "\n#{decode(content)}\n```\n" + end + + def handle_unknown_node(node) + if node.respond_to?(:content) + if node.content.is_a?(Array) + node.content.map { |child| convert(child) }.join("\n") + else + decode(node.content) + end + elsif node.respond_to?(:text) + decode(node.text) + else + "" + end + end + + def decode str + unless str.nil? + str = str. + gsub('<', '<'). + gsub('>', '>'). + gsub('+', '+'). # plus sign; alternately could use \c(pl + gsub(' ', ' '). # non-breaking space + gsub('®', '(R)'). # registered trademark + gsub(' ', ' '). # thin space + gsub('–', '-'). # en dash + gsub('—', '-'). # em dash + gsub('‘', %(')). # left single quotation mark + gsub('’', %(')). # right single quotation mark + gsub('“', %(")). # left double quotation mark + gsub('”', %("")). # right double quotation mark + gsub('←', '<-'). # leftwards arrow + gsub('→', '->'). # rightwards arrow + gsub('⇐', '->'). # leftwards double arrow + gsub('⇒', '<-'). # rightwards double arrow + gsub('&', '&'). # literal ampersand (NOTE must take place after any other replacement that includes &) + gsub('\'', %(')). 
# apostrophe / neutral single quote + rstrip # strip trailing space + end + str + end + + end +end From 54e6eb6389e83f22eb9a3dc67ebcdd46a06463bb Mon Sep 17 00:00:00 2001 From: Lukas Piwowarski Date: Mon, 17 Mar 2025 18:31:50 +0200 Subject: [PATCH 2/5] Add unit tests for lightspeed_rag_content.asciidoc This commit adds unit tests for the ligthspeed_rag_content.asciidoc package. Signed-off-by: Lukas Piwowarski --- tests/asciidoc/__init__.py | 0 tests/asciidoc/test__main__.py | 137 ++++++++++++++++++++ tests/asciidoc/test_asciidoc_conveter.py | 151 +++++++++++++++++++++++ 3 files changed, 288 insertions(+) create mode 100644 tests/asciidoc/__init__.py create mode 100644 tests/asciidoc/test__main__.py create mode 100644 tests/asciidoc/test_asciidoc_conveter.py diff --git a/tests/asciidoc/__init__.py b/tests/asciidoc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/asciidoc/test__main__.py b/tests/asciidoc/test__main__.py new file mode 100644 index 00000000..5d48a9e4 --- /dev/null +++ b/tests/asciidoc/test__main__.py @@ -0,0 +1,137 @@ +# Copyright 2025 Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+import argparse +import subprocess +import unittest +from pathlib import Path +from unittest.mock import Mock, patch + +from lightspeed_rag_content.asciidoc.__main__ import ( + get_argument_parser, + main_convert, + main_get_structure, +) +from lightspeed_rag_content.asciidoc.asciidoctor_converter import RUBY_ASCIIDOC_DIR + + +class Test__main__(unittest.TestCase): + def setUp(self): + super().setUp() + + self.asciidoctor_cmd = "/usr/bin/asciidoctor" + self.input_file = Path("input.adoc") + self.output_file = Path("output.adoc") + self.text_converter_file = RUBY_ASCIIDOC_DIR.joinpath("asciidoc_text_converter.rb") + self.structure_dumper_file = RUBY_ASCIIDOC_DIR.joinpath("asciidoc_structure_dumper.rb") + + def get_mock_parsed_args(self) -> Mock: + mock_args = Mock() + mock_args.input_file = self.input_file + mock_args.output_file = self.output_file + mock_args.converter_file = self.text_converter_file + mock_args.attributes_file = None + mock_args.target_format = "text" + + return mock_args + + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.subprocess.run") + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.shutil.which") + def test_main_convert(self, mock_which, mock_run): + mock_which.return_value = self.asciidoctor_cmd + mock_args = self.get_mock_parsed_args() + main_convert(mock_args) + + mock_run.assert_called_with( + [ + "/usr/bin/asciidoctor", + "-r", + str(self.text_converter_file.absolute()), + "-b", + "text", + "-o", + str(self.output_file.absolute()), + "--trace", + "--quiet", + str(self.input_file.absolute()), + ], + check=True, + capture_output=True, + ) + + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.subprocess.run") + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.shutil.which") + def test_main_convert_incorrect_cmd_error(self, mock_which, mock_run): + mock_which.return_value = self.asciidoctor_cmd + mock_run.side_effect = subprocess.CalledProcessError(cmd=self.asciidoctor_cmd, 
returncode=1) + mock_args = self.get_mock_parsed_args() + + with self.assertRaises(SystemExit) as e: + main_convert(mock_args) + self.assertNotEqual(e.exception.code, 0) + + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.shutil.which") + def test_main_convert_missing_asciidoctor_cmd(self, mock_which): + mock_which.return_value = "" + mock_args = self.get_mock_parsed_args() + + with self.assertRaises(SystemExit) as e: + main_convert(mock_args) + self.assertNotEqual(e.exception.code, 0) + + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.subprocess.run") + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.shutil.which") + def test_main_get_structure(self, mock_which, mock_run): + mock_which.return_value = "/usr/bin/ruby" + mock_args = Mock() + mock_args.input_file = self.input_file + + main_get_structure(mock_args) + mock_run.assert_called_with( + [ + "/usr/bin/ruby", + str(self.structure_dumper_file), + str(self.input_file.absolute()), + ], + check=True, + ) + + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.subprocess.run") + def test_main_incorrect_asciidoctor_cmd(self, mock_run): + mock_run.side_effect = subprocess.CalledProcessError(cmd=self.asciidoctor_cmd, returncode=1) + mock_args = Mock() + mock_args.input_file = self.input_file + + with self.assertRaises(SystemExit) as e: + main_get_structure(mock_args) + self.assertNotEqual(e.exception.code, 0) + + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.shutil.which") + def test_main_missing_asciidoctor_cmd(self, mock_which): + mock_which.return_value = "" + mock_args = Mock() + mock_args.input_file = self.input_file + + with self.assertRaises(SystemExit) as e: + with self.assertLogs() as logger: + main_get_structure(mock_args) + self.assertNotEqual(e.exception.code, 0) + + error_msgs = [output for output in logger.output if "ERROR" in output] + self.assertTrue(len(error_msgs) > 0) + + def test_get_argument_parser(self): + args = 
get_argument_parser() + + self.assertIsInstance(args, argparse.ArgumentParser) diff --git a/tests/asciidoc/test_asciidoc_conveter.py b/tests/asciidoc/test_asciidoc_conveter.py new file mode 100644 index 00000000..a952e9fa --- /dev/null +++ b/tests/asciidoc/test_asciidoc_conveter.py @@ -0,0 +1,151 @@ +# Copyright 2025 Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +import unittest +from pathlib import Path +from unittest.mock import Mock, mock_open, patch + +import yaml + +from lightspeed_rag_content.asciidoc.asciidoctor_converter import ( + RUBY_ASCIIDOC_DIR, + AsciidoctorConverter, +) + + +class TestAsciidoctorConverter(unittest.TestCase): + def setUp(self): + super().setUp() + + self.valid_attributes_file = """--- + foo: bar + """ + + self.invalid_attributes_file = """--- + [[] + """ + + self.text_converter_path = RUBY_ASCIIDOC_DIR.joinpath("asciidoc_text_converter.rb") + self.input_file = Path("input.adoc") + self.output_file = Path("output.txt") + self.attributes_file = Path("attributes.yaml") + self.asciidoctor_cmd = "/usr/bin/asciidoctor" + + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.subprocess.run") + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.shutil.which") + def test_convert(self, mock_which, mock_run): + mock_which.return_value = self.asciidoctor_cmd + with patch("builtins.open", mock_open(read_data=self.valid_attributes_file)): + adoc_text_converter = 
AsciidoctorConverter(attributes_file=self.attributes_file) + adoc_text_converter.convert(self.input_file, self.output_file) + + mock_run.assert_called_with( + [ + self.asciidoctor_cmd, + "-a", + "foo=bar", + "-r", + str(self.text_converter_path.absolute()), + "-b", + "text", + "-o", + str(self.output_file.absolute()), + "--trace", + "--quiet", + str(self.input_file.absolute()), + ], + check=True, + capture_output=True, + ) + + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.subprocess.run") + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.shutil.which") + def test_convert_custom_converter(self, mock_which, mock_run): + mock_which.return_value = self.asciidoctor_cmd + custom_converter = Path("custom_converter") + adoc_text_converter = AsciidoctorConverter(converter_file=custom_converter) + adoc_text_converter.convert(self.input_file, self.output_file) + + mock_run.assert_called_with( + [ + self.asciidoctor_cmd, + "-r", + str(custom_converter.absolute()), + "-b", + "text", + "-o", + str(self.output_file.absolute()), + "--trace", + "--quiet", + str(self.input_file.absolute()), + ], + check=True, + capture_output=True, + ) + + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.subprocess.run") + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.shutil.which") + def test_convert_overwrite_output_file(self, mock_which, mock_run): + mock_which.return_value = self.asciidoctor_cmd + adoc_text_converter = AsciidoctorConverter() + + mock_output_file = Mock() + mock_output_file.exists.return_value = True + + with self.assertLogs() as logger: + adoc_text_converter.convert(self.input_file, mock_output_file) + warning_msgs = [output for output in logger.output if "WARNING" in output] + self.assertTrue(len(warning_msgs) > 0) + + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.subprocess.run") + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.shutil.which") + def test_convert_new_output_file(self, 
mock_which, mock_run): + mock_which.return_value = self.asciidoctor_cmd + adoc_text_converter = AsciidoctorConverter() + + output_file = Mock() + output_file.exists.return_value = False + output_file.absolute.return_value = "/output.txt" + + adoc_text_converter.convert(self.input_file, output_file) + output_file.parent.mkdir.assert_called_once() + + def test__get_default_converter_file(self): + converter_file = AsciidoctorConverter._get_default_converter_file("text") + self.assertEqual(converter_file, RUBY_ASCIIDOC_DIR.joinpath("asciidoc_text_converter.rb")) + + def test__get_default_converter_file_invalid_format(self): + with self.assertRaises(FileNotFoundError): + AsciidoctorConverter._get_default_converter_file("invalid") + + @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.shutil.which") + def test__get_asciidoctor_path_missing(self, mock_which): + mock_which.return_value = "" + with self.assertRaises(FileNotFoundError): + AsciidoctorConverter() + + def test__get_attribute_list_valid_yaml(self): + with patch("builtins.open", mock_open(read_data=self.valid_attributes_file)) as m: + AsciidoctorConverter._get_attribute_list("valid.yaml") + m.assert_called_once() + + def test__get_attribute_list_invalid_yaml(self): + with patch("builtins.open", mock_open(read_data=self.invalid_attributes_file)): + with self.assertRaises(yaml.YAMLError): + AsciidoctorConverter._get_attribute_list("invalid.yaml") + + def test__get_attribute_list_empty_yaml(self): + with patch("builtins.open", mock_open(read_data="")): + attributes = AsciidoctorConverter._get_attribute_list("non_existing.yaml") + self.assertEqual(attributes, []) From 711c65c5fd965604b28fc43dbfbd560ae4f67b5b Mon Sep 17 00:00:00 2001 From: Lukas Piwowarski Date: Fri, 21 Mar 2025 10:26:49 +0200 Subject: [PATCH 3/5] Add support for built-in asciidoctor formats This commit allows using the AsciidoctorConverter to convert AsciiDoc files to target formats that are by default supported by asciidoctor: - html5 - 
xhtml5 - manpage Signed-off-by: Lukas Piwowarski --- .../asciidoc/__main__.py | 1 + .../asciidoc/asciidoctor_converter.py | 20 ++++++++++++------- tests/asciidoc/test_asciidoc_conveter.py | 12 +++++++---- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/lightspeed_rag_content/asciidoc/__main__.py b/src/lightspeed_rag_content/asciidoc/__main__.py index 76957467..2da28803 100644 --- a/src/lightspeed_rag_content/asciidoc/__main__.py +++ b/src/lightspeed_rag_content/asciidoc/__main__.py @@ -123,6 +123,7 @@ def get_argument_parser() -> argparse.ArgumentParser: required=False, type=str, default="text", + choices=["text", "html5", "xhtml5", "manpage"], help="Target format in which the input file should be saved.", ) diff --git a/src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py b/src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py index 8d4650bc..8f82a704 100644 --- a/src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py +++ b/src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py @@ -75,7 +75,9 @@ def __init__( """Initialize AsciidoctorConverter. Args: - target_format: A format to which input files should be converted. + target_format: + A format to which input files should be converted. These formats + are currently supported: text, html5, xhtml5, manpage. attributes_file: A path pointing to an attributes file. converter_file: An asciidoctor compatible extension. 
@@ -89,23 +91,27 @@ def __init__( self.target_format = target_format self.attribute_list = self._get_attribute_list(attributes_file) - if converter_file: - self.converter_file = converter_file + if not converter_file: + self.converter_file = self._get_converter_file(target_format) else: - self.converter_file = self._get_default_converter_file(target_format) + self.converter_file = converter_file self.asciidoctor_cmd = self._get_asciidoctor_path() @staticmethod - def _get_default_converter_file(target_format: str) -> Path: - """Return path to asciidoctor Ruby based extension.""" + def _get_converter_file(target_format: str) -> Path | None: + """Return converter file if target_format requires one.""" + asciidoctor_supported_formats = ["html5", "xhtml5", "manpage"] + if target_format in asciidoctor_supported_formats: + return None + converter_files = { "text": "asciidoc_text_converter.rb", } if not (converter_file := converter_files.get(target_format, None)): raise FileNotFoundError( - f"There is no built-in extension for target format: {target_format}" + f"There is no extension available for target format: {target_format}" ) return RUBY_ASCIIDOC_DIR.joinpath(converter_file) diff --git a/tests/asciidoc/test_asciidoc_conveter.py b/tests/asciidoc/test_asciidoc_conveter.py index a952e9fa..6d883f62 100644 --- a/tests/asciidoc/test_asciidoc_conveter.py +++ b/tests/asciidoc/test_asciidoc_conveter.py @@ -121,13 +121,17 @@ def test_convert_new_output_file(self, mock_which, mock_run): adoc_text_converter.convert(self.input_file, output_file) output_file.parent.mkdir.assert_called_once() - def test__get_default_converter_file(self): - converter_file = AsciidoctorConverter._get_default_converter_file("text") + def test__get_converter_file(self): + converter_file = AsciidoctorConverter._get_converter_file("text") self.assertEqual(converter_file, RUBY_ASCIIDOC_DIR.joinpath("asciidoc_text_converter.rb")) - def test__get_default_converter_file_invalid_format(self): + def 
test__get_converter_file_asciidoctor_built_in_format(self): + converter_file = AsciidoctorConverter._get_converter_file("html5") + self.assertEqual(converter_file, None) + + def test__get_converter_file_invalid_format(self): with self.assertRaises(FileNotFoundError): - AsciidoctorConverter._get_default_converter_file("invalid") + AsciidoctorConverter._get_converter_file("invalid") @patch("lightspeed_rag_content.asciidoc.asciidoctor_converter.shutil.which") def test__get_asciidoctor_path_missing(self, mock_which): From 9b25c9c5fc4b1a8375477223ad081c6f9aad08b4 Mon Sep 17 00:00:00 2001 From: Lukas Piwowarski Date: Mon, 24 Mar 2025 17:07:25 +0200 Subject: [PATCH 4/5] Leave string interpolation up to logging --- .../asciidoc/asciidoctor_converter.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py b/src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py index 8f82a704..6ab91da9 100644 --- a/src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py +++ b/src/lightspeed_rag_content/asciidoc/asciidoctor_converter.py @@ -123,7 +123,7 @@ def _get_asciidoctor_path() -> str: if not asciidoctor_path: raise FileNotFoundError("asciidoctor executable not found") - LOG.info(f"Using asciidoctor with {asciidoctor_path} path") + LOG.info("Using asciidoctor with %s path", asciidoctor_path) return asciidoctor_path @staticmethod @@ -156,12 +156,13 @@ def convert(self, source_file: Path, destination_file: Path) -> None: subprocess.CalledSubprocessError: If an error occurs when running asciidoctor. """ - LOG.info("Processing: " + str(source_file.absolute())) + LOG.info("Processing: %s", str(source_file.absolute())) if not destination_file.exists(): destination_file.parent.mkdir(parents=True, exist_ok=True) else: LOG.warning( - f"Destination file {destination_file} exists. It will be overwritten!" + "Destination file %s exists. 
It will be overwritten!", + destination_file, ) command = [self.asciidoctor_cmd] From 3229531f3401d278b1d6476fc47635d9fdda0f25 Mon Sep 17 00:00:00 2001 From: Lukas Piwowarski Date: Tue, 25 Mar 2025 11:53:19 +0200 Subject: [PATCH 5/5] Install asciidoctor in rag-content-{cpu,gpu} This commit adds the asciidoctor binary to the base image. This allows the consumers of that image to use the lightspeed_rag_content.asciidoc sub-package, as it is heavily dependent on asciidoctor. --- Containerfile.base | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Containerfile.base b/Containerfile.base index d5d95d15..649a9aa8 100644 --- a/Containerfile.base +++ b/Containerfile.base @@ -5,8 +5,7 @@ ARG FLAVOR FROM nvcr.io/nvidia/cuda:12.6.2-devel-ubi9 as gpu-base ARG FLAVOR -RUN dnf install -y python3.11 python3.11-pip libcudnn8 libnccl git && \ - dnf clean all +RUN dnf install -y python3.11 python3.11-pip libcudnn8 libnccl git RUN ln -sf /usr/bin/python3.11 /usr/bin/python ENV LD_LIBRARY_PATH=/usr/local/cuda-12.6/compat:$LD_LIBRARY_PATH @@ -14,6 +13,10 @@ FROM ${FLAVOR}-base as road-core-rag-builder ARG FLAVOR USER 0 +RUN dnf install -y rubygems && \ + dnf clean all && \ + gem install asciidoctor + WORKDIR /rag-content ENV EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2