From 7a284d9ab45722ed601a8505318b19eb1015e154 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 6 Feb 2026 17:37:50 -0500 Subject: [PATCH 1/3] Port unported updates since Machine df7d6e9c0bf1de8cba9462fba89208e6546db8fe --- machine/corpora/__init__.py | 8 +- .../file_paratext_project_file_handler.py | 18 +- ...xt_project_versification_error_detector.py | 4 +- machine/corpora/paratext_project_settings.py | 4 +- ...ject_versification_error_detector_base.py} | 16 +- ...scripture_ref_usfm_parser_handler_base.py} | 69 +++-- machine/corpora/update_usfm_parser_handler.py | 22 +- machine/corpora/usfm_parser.py | 7 + machine/corpora/usfm_parser_state.py | 5 +- machine/corpora/usfm_text_base.py | 4 +- .../zip_paratext_project_file_handler.py | 10 +- ...paratext_project_versification_detector.py | 4 +- machine/scripture/verse_ref.py | 2 - pyproject.toml | 2 +- ...place_markers_usfm_update_block_handler.py | 87 +++++-- .../test_update_usfm_parser_handler.py | 113 +++++++++ tests/corpora/test_usfm_memory_text.py | 238 ++++++++++++++++++ tests/scripture/test_verse_ref.py | 2 - .../data/usfm/Tes/{custom.vrs => Custom.vrs} | 0 ...xt_project_versification_error_detector.py | 4 +- 20 files changed, 552 insertions(+), 67 deletions(-) rename machine/corpora/{paratext_project_versification_error_detector.py => paratext_project_versification_error_detector_base.py} (77%) rename machine/corpora/{scripture_ref_usfm_parser_handler.py => scripture_ref_usfm_parser_handler_base.py} (81%) rename tests/testutils/data/usfm/Tes/{custom.vrs => Custom.vrs} (100%) diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 99a69191..d07e52ee 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -28,12 +28,12 @@ from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase -from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector +from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase from .paratext_text_corpus import ParatextTextCorpus from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler from .scripture_element import ScriptureElement from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef -from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType +from .scripture_ref_usfm_parser_handler_base import ScriptureRefUsfmParserHandlerBase, ScriptureTextType from .scripture_text_corpus import ( ScriptureTextCorpus, create_versification_ref_corpus, @@ -139,7 +139,7 @@ "ParatextProjectSettingsParserBase", "ParatextProjectTermsParserBase", "ParatextProjectTextUpdaterBase", - "ParatextProjectVersificationErrorDetector", + "ParatextProjectVersificationErrorDetectorBase", "ParatextTextCorpus", "parse_usfm", "PlaceMarkersAlignmentInfo", @@ -147,7 +147,7 @@ "RtlReferenceOrder", "ScriptureElement", "ScriptureRef", - "ScriptureRefUsfmParserHandler", + "ScriptureRefUsfmParserHandlerBase", "ScriptureTextCorpus", "ScriptureTextType", "StandardParallelTextCorpus", diff --git a/machine/corpora/file_paratext_project_file_handler.py b/machine/corpora/file_paratext_project_file_handler.py index 8cdc3dd2..a9846645 100644 --- a/machine/corpora/file_paratext_project_file_handler.py +++ b/machine/corpora/file_paratext_project_file_handler.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from typing import BinaryIO, Optional @@ -11,17 +12,28 @@ def __init__(self, project_dir: StrPath) -> None: self._project_dir = Path(project_dir) def exists(self, file_name: str) -> bool: - return (self._project_dir / file_name).is_file() + for actual_file_name in os.listdir(self._project_dir): + if actual_file_name.lower() == file_name.lower(): + return True + return False def open(self, file_name: str) -> BinaryIO: + for actual_file_name in os.listdir(self._project_dir): + if actual_file_name.lower() == file_name.lower(): + return open(self._project_dir / actual_file_name, "rb") return open(self._project_dir / file_name, "rb") def find(self, extension: str) -> Optional[Path]: return next(self._project_dir.glob(f"*{extension}"), None) def create_stylesheet(self, file_name: str) -> UsfmStylesheet: - custom_stylesheet_filename = self._project_dir / "custom.sty" + custom_stylesheet_file_name = "custom.sty" + for actual_file_name in os.listdir(self._project_dir): + if actual_file_name.lower() == custom_stylesheet_file_name: + custom_stylesheet_file_name = actual_file_name + break + custom_stylesheet_path = self._project_dir / custom_stylesheet_file_name return UsfmStylesheet( file_name, - custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None, + custom_stylesheet_path if custom_stylesheet_path.is_file() else None, ) diff --git a/machine/corpora/file_paratext_project_versification_error_detector.py b/machine/corpora/file_paratext_project_versification_error_detector.py index 4e2cdac3..5f451894 100644 --- a/machine/corpora/file_paratext_project_versification_error_detector.py +++ b/machine/corpora/file_paratext_project_versification_error_detector.py @@ -1,10 +1,10 @@ from ..utils.typeshed import StrPath from .file_paratext_project_file_handler import FileParatextProjectFileHandler from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser -from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector +from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase -class FileParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetector): +class FileParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetectorBase): def __init__(self, project_dir: StrPath) -> None: super().__init__( FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse() diff --git a/machine/corpora/paratext_project_settings.py b/machine/corpora/paratext_project_settings.py index ad86b303..5f747257 100644 --- a/machine/corpora/paratext_project_settings.py +++ b/machine/corpora/paratext_project_settings.py @@ -53,9 +53,9 @@ def get_book_file_name(self, book_id: str) -> str: book_part = _get_book_file_name_digits(book_id) + book_id return self.file_name_prefix + book_part + self.file_name_suffix - def get_all_scripture_book_file_names(self) -> Iterable[str]: + def get_all_scripture_book_ids(self) -> Iterable[str]: for book_id in get_scripture_books(): - yield self.get_book_file_name(book_id) + yield book_id def _get_book_file_name_digits(book_id: str) -> str: diff --git a/machine/corpora/paratext_project_versification_error_detector.py b/machine/corpora/paratext_project_versification_error_detector_base.py similarity index 77% rename from machine/corpora/paratext_project_versification_error_detector.py rename to machine/corpora/paratext_project_versification_error_detector_base.py index 64ceee32..4b96be59 100644 --- a/machine/corpora/paratext_project_versification_error_detector.py +++ b/machine/corpora/paratext_project_versification_error_detector_base.py @@ -1,5 +1,6 @@ -from typing import List, Optional, Union +from typing import List, Optional, Set, Union +from ..scripture.canon import book_id_to_number from .paratext_project_file_handler import ParatextProjectFileHandler from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase @@ -7,7 +8,7 @@ from .usfm_versification_error_detector import UsfmVersificationError, UsfmVersificationErrorDetector -class ParatextProjectVersificationErrorDetector: +class ParatextProjectVersificationErrorDetectorBase: def __init__( self, paratext_project_file_handler: ParatextProjectFileHandler, @@ -20,14 +21,19 @@ def __init__( self._settings = settings def get_usfm_versification_errors( - self, - handler: Optional[UsfmVersificationErrorDetector] = None, + self, handler: Optional[UsfmVersificationErrorDetector] = None, books: Optional[Set[int]] = None ) -> List[UsfmVersificationError]: handler = handler or UsfmVersificationErrorDetector(self._settings) - for file_name in self._settings.get_all_scripture_book_file_names(): + for book_id in self._settings.get_all_scripture_book_ids(): + + file_name = self._settings.get_book_file_name(book_id) + if not self._paratext_project_file_handler.exists(file_name): continue + if books is not None and not book_id_to_number(book_id) in books: + continue + with self._paratext_project_file_handler.open(file_name) as sfm_file: usfm: str = sfm_file.read().decode(self._settings.encoding) try: diff --git a/machine/corpora/scripture_ref_usfm_parser_handler.py b/machine/corpora/scripture_ref_usfm_parser_handler_base.py similarity index 81% rename from machine/corpora/scripture_ref_usfm_parser_handler.py rename to machine/corpora/scripture_ref_usfm_parser_handler_base.py index efd2962d..fc88dcf1 100644 --- a/machine/corpora/scripture_ref_usfm_parser_handler.py +++ b/machine/corpora/scripture_ref_usfm_parser_handler_base.py @@ -22,10 +22,14 @@ class ScriptureTextType(Enum): def _is_embed_style(marker: Optional[str]) -> bool: - return marker is not None and (marker.strip("*") in _EMBED_STYLES or marker.startswith("z")) + return marker is not None and marker.strip("*") in _EMBED_STYLES -class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC): +def is_private_use_marker(marker: str): + return marker is not None and marker.startswith("z") + + +class ScriptureRefUsfmParserHandlerBase(UsfmParserHandler, ABC): def __init__(self) -> None: self._cur_verse_ref: VerseRef = VerseRef() self._cur_elements_stack: List[ScriptureElement] = [] @@ -46,22 +50,29 @@ def chapter(self, state: UsfmParserState, number: str, marker: str, alt_number: def verse( self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str] ) -> None: - if state.verse_ref == self._cur_verse_ref and not self._duplicate_verse: - self._end_verse_text(state, self._create_verse_refs()) - # ignore duplicate verses - self._duplicate_verse = True + # Non-latin numbers are implicitly handled + + if state.chapter_has_verse_zero and state.verse_ref.verse_num == 0: + # Fall through for the special case of verse 0 being specified in the USFM + pass + elif state.verse_ref == self._cur_verse_ref and not self._duplicate_verse: + if state.verse_ref.verse_num > 0: + self._end_verse_text(state, self._create_verse_refs()) + # ignore duplicate verses + self._duplicate_verse = True + return elif are_overlapping_verse_ranges(verse1=number, verse2=self._cur_verse_ref.verse): # merge overlapping verse ranges in to one range verse_ref: VerseRef = self._cur_verse_ref.copy() verse_ref.verse = merge_verse_ranges(number, self._cur_verse_ref.verse) self._update_verse_ref(verse_ref, marker) + return + if self._current_text_type == ScriptureTextType.NONVERSE: + self._end_non_verse_text_wrapper(state) else: - if self._current_text_type == ScriptureTextType.NONVERSE: - self._end_non_verse_text_wrapper(state) - elif self._current_text_type == ScriptureTextType.VERSE: - self._end_verse_text_wrapper(state) - self._update_verse_ref(state.verse_ref, marker) - self._start_verse_text_wrapper(state) + self._end_verse_text_wrapper(state) + self._update_verse_ref(state.verse_ref, marker) + self._start_verse_text_wrapper(state) def start_para( self, @@ -70,6 +81,10 @@ def start_para( unknown: Optional[bool], attributes: Optional[Sequence[UsfmAttribute]], ) -> None: + # ignore private-use markers + if is_private_use_marker(marker): + return + if self._cur_verse_ref.is_default: self._update_verse_ref(state.verse_ref, marker) if not state.is_verse_text: @@ -77,6 +92,10 @@ def start_para( self._start_non_verse_text_wrapper(state) def end_para(self, state: UsfmParserState, marker: str) -> None: + # ignore private-use markers + if is_private_use_marker(marker): + return + if self._current_text_type == ScriptureTextType.NONVERSE: self._end_parent_element() self._end_non_verse_text_wrapper(state) @@ -126,6 +145,10 @@ def opt_break(self, state: UsfmParserState) -> None: def start_char( self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]] ) -> None: + # ignore private-use markers + if is_private_use_marker(marker): + return + # if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment self._check_convert_verse_para_to_non_verse(state) @@ -135,6 +158,10 @@ def start_char( def end_char( self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool ) -> None: + # ignore private-use markers + if is_private_use_marker(marker): + return + if _is_embed_style(marker): self._end_embed_text_wrapper(state) @@ -162,9 +189,9 @@ def _start_verse_text_wrapper(self, state: UsfmParserState) -> None: self._start_verse_text(state, self._create_verse_refs()) def _end_verse_text_wrapper(self, state: UsfmParserState) -> None: - if not self._duplicate_verse and self._cur_verse_ref.verse_num > 0: + if not self._duplicate_verse and (self._cur_verse_ref.verse_num > 0 or state.chapter_has_verse_zero): self._end_verse_text(state, self._create_verse_refs()) - if self._cur_verse_ref.verse_num > 0: + if self._cur_verse_ref.verse_num > 0 or state.chapter_has_verse_zero: self._cur_text_type_stack.pop() def _start_non_verse_text_wrapper(self, state: UsfmParserState) -> None: @@ -177,7 +204,17 @@ def _end_non_verse_text_wrapper(self, state: UsfmParserState) -> None: self._cur_text_type_stack.pop() def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None: - if not are_overlapping_verse_ranges(verse_ref, self._cur_verse_ref): + if ( + self._cur_verse_ref.verse_num == 0 + and verse_ref.verse_num == 0 + and not verse_ref.has_multiple + and marker == "v" + ): + # As the verse 0 marker appears within the middle of verse 0, + # we should not break the position of current element stack by clearing it. + # Instead, we just need to pop the current element off the stack. + self._cur_elements_stack.pop() + elif not are_overlapping_verse_ranges(verse_ref, self._cur_verse_ref): self._cur_elements_stack.clear() self._cur_elements_stack.append(ScriptureElement(0, marker)) self._cur_verse_ref = verse_ref.copy() @@ -239,6 +276,8 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None and para_tag.marker != "tr" and state.is_verse_para and self._cur_verse_ref.verse_num == 0 + and not state.chapter_has_verse_zero + and not is_private_use_marker(para_tag.marker) ): self._start_parent_element(para_tag.marker) self._start_non_verse_text_wrapper(state) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index fafabcc2..903ab3d1 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -3,7 +3,7 @@ from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification from .scripture_ref import ScriptureRef -from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType +from .scripture_ref_usfm_parser_handler_base import ScriptureRefUsfmParserHandlerBase, ScriptureTextType from .usfm_parser_state import UsfmParserState from .usfm_stylesheet import UsfmStylesheet from .usfm_tag import UsfmTextType @@ -38,7 +38,11 @@ def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[d self.metadata = metadata -class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler): +def sanitize_verse_data(verse_data: str) -> str: + return verse_data.replace("\u200F", "") + + +class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandlerBase): def __init__( self, rows: Optional[Sequence[UpdateUsfmRow]] = None, @@ -319,10 +323,16 @@ def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRe self._end_update_block(state, [scripture_ref]) def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: + # If this embed is outside an update block, create an update block just for this embed + embed_outside_of_block = len(self._update_block_stack) == 0 + if embed_outside_of_block: + self._start_update_block([scripture_ref]) self._update_block_stack[-1].add_embed( self._embed_tokens, marked_for_removal=self._embed_behavior == UpdateUsfmMarkerBehavior.STRIP ) self._embed_tokens.clear() + if embed_outside_of_block: + self._end_update_block(state, [scripture_ref]) def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: if isinstance(stylesheet, str): @@ -349,6 +359,12 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str] row_texts: List[str] = [] row_metadata = None source_index: int = 0 + + # handle the special case of verse 0, which although first in the rows, + # it will be retrieved some of other segments in the verse. + if len(seg_scr_refs) > 0 and seg_scr_refs[0].verse_num == 0 and len(seg_scr_refs[0].path) == 0: + self._verse_row_index = 0 + while self._verse_row_index < len(self._verse_rows) and source_index < len(seg_scr_refs): compare: int = 0 row = self._rows[self._verse_rows[self._verse_row_index]] @@ -378,6 +394,8 @@ def _collect_updatable_tokens(self, state: UsfmParserState) -> None: self._use_updated_text() while self._token_index <= state.index + state.special_token_count: token = state.tokens[self._token_index] + if token.type == UsfmTokenType.VERSE and token.data is not None: + token.data = sanitize_verse_data(token.data) if self._current_text_type == ScriptureTextType.EMBED: self._embed_tokens.append(token) elif ( diff --git a/machine/corpora/usfm_parser.py b/machine/corpora/usfm_parser.py index e13acfed..a37d5396 100644 --- a/machine/corpora/usfm_parser.py +++ b/machine/corpora/usfm_parser.py @@ -223,6 +223,8 @@ def process_token(self) -> bool: verse_ref = self.state.verse_ref verse_ref.chapter = token.data verse_ref.verse_num = 0 + self.state.chapter_has_verse_zero = False + # Verse offset is not zeroed for chapter 1, as it is part of intro if verse_ref.chapter_num != 1: self.state.verse_offset = 0 @@ -261,7 +263,12 @@ def process_token(self) -> bool: assert token.data is not None verse_ref = self.state.verse_ref + prev_verse_num = verse_ref.verse_num verse_ref.verse = token.data + if verse_ref.verse_num == 0: # This token is \v 0 + self.state.chapter_has_verse_zero = True + elif verse_ref.verse_num == -1: # Ignore invalid verse numbers + verse_ref.verse_num = prev_verse_num self.state.verse_offset = 0 if self.handler is not None: diff --git a/machine/corpora/usfm_parser_state.py b/machine/corpora/usfm_parser_state.py index 3d0b9e82..f0ef8a74 100644 --- a/machine/corpora/usfm_parser_state.py +++ b/machine/corpora/usfm_parser_state.py @@ -37,6 +37,7 @@ def __init__(self, stylesheet: UsfmStylesheet, versification: Versification, tok self._tokens = tokens self.index = -1 self.special_token = False + self.chapter_has_verse_zero = False self._special_token_count: int = 0 @property @@ -108,8 +109,8 @@ def is_verse_para(self) -> bool: @property def is_verse_text(self) -> bool: - # anything before verse 1 is not verse text - if self.verse_ref.verse_num == 0: + # anything before verse 1 is not verse text, unless the USFM specified verse 0 + if self.verse_ref.verse_num == 0 and not self.chapter_has_verse_zero: return False # Sidebars and notes are not verse text diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py index 830802ca..ee400909 100644 --- a/machine/corpora/usfm_text_base.py +++ b/machine/corpora/usfm_text_base.py @@ -7,7 +7,7 @@ from ..utils.string_utils import has_sentence_ending from .corpora_utils import gen from .scripture_ref import ScriptureRef -from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType +from .scripture_ref_usfm_parser_handler_base import ScriptureRefUsfmParserHandlerBase, ScriptureTextType from .scripture_text import ScriptureText from .stream_container import StreamContainer from .text_row import TextRow @@ -76,7 +76,7 @@ def _read_usfm(self) -> str: return reader.read() -class _TextRowCollector(ScriptureRefUsfmParserHandler): +class _TextRowCollector(ScriptureRefUsfmParserHandlerBase): def __init__(self, text: UsfmTextBase) -> None: super().__init__() diff --git a/machine/corpora/zip_paratext_project_file_handler.py b/machine/corpora/zip_paratext_project_file_handler.py index f97c46d5..3f430048 100644 --- a/machine/corpora/zip_paratext_project_file_handler.py +++ b/machine/corpora/zip_paratext_project_file_handler.py @@ -13,7 +13,10 @@ def __init__(self, archive: ZipFile) -> None: self._archive = archive def exists(self, file_name: str) -> bool: - return file_name in self._archive.namelist() + for actual_entry_name in self._archive.namelist(): + if actual_entry_name.lower() == file_name.lower(): + return True + return False def find(self, extension: str) -> Optional[str]: for entry in self._archive.namelist(): @@ -22,8 +25,9 @@ def find(self, extension: str) -> Optional[str]: return None def open(self, file_name: str) -> Optional[BinaryIO]: - if file_name in self._archive.namelist(): - return BytesIO(self._archive.read(file_name)) + for actual_entry_name in self._archive.namelist(): + if actual_entry_name.lower() == file_name.lower(): + return BytesIO(self._archive.read(actual_entry_name)) return None def create_stylesheet(self, file_name: str) -> UsfmStylesheet: diff --git a/machine/corpora/zip_paratext_project_versification_detector.py b/machine/corpora/zip_paratext_project_versification_detector.py index cf4bf66e..ccb287c2 100644 --- a/machine/corpora/zip_paratext_project_versification_detector.py +++ b/machine/corpora/zip_paratext_project_versification_detector.py @@ -1,10 +1,10 @@ from zipfile import ZipFile -from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector +from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase from .zip_paratext_project_file_handler import ZipParatextProjectFileHandler from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser -class ZipParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetector): +class ZipParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetectorBase): def __init__(self, archive: ZipFile): super().__init__(ZipParatextProjectFileHandler(archive), ZipParatextProjectSettingsParser(archive).parse()) diff --git a/machine/scripture/verse_ref.py b/machine/scripture/verse_ref.py index f35ea0fe..86ca1407 100644 --- a/machine/scripture/verse_ref.py +++ b/machine/scripture/verse_ref.py @@ -149,8 +149,6 @@ def verse_num(self) -> int: @verse_num.setter def verse_num(self, value: int) -> None: - if value < 0: - raise ValueError("The verse number cannot be negative.") self._verse_num = value self._verse = None diff --git a/pyproject.toml b/pyproject.toml index d337c7f5..c3a19faa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ reportMissingModuleSource = false [tool.poetry] name = "sil-machine" -version = "1.8.5" +version = "1.8.6" description = "A natural language processing library that is focused on providing tools for resource-poor languages." license = "MIT" authors = ["SIL International"] diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py index 5f75c635..cc4c7385 100644 --- a/tests/corpora/test_place_markers_usfm_update_block_handler.py +++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py @@ -50,7 +50,7 @@ def test_paragraph_markers() -> None: \p Este texto está en inglés \p y esta prueba es para marcadores de párrafo. """ - assess(target, result) + assert_usfm_equals(target, result) def test_style_markers() -> None: @@ -81,7 +81,7 @@ def test_style_markers() -> None: \c 1 \v 1 Esta es la \w primera\w* oración. Este texto está en \w inglés\w* y esta prueba es \w para\w* marcadores de estilo. """ - assess(target, result) + assert_usfm_equals(target, result) align_info = PlaceMarkersAlignmentInfo( source_tokens=[t for t in TOKENIZER.tokenize(source)], @@ -103,7 +103,7 @@ def test_style_markers() -> None: \c 1 \v 1 Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo. """ - assess(target, result) + assert_usfm_equals(target, result) # NOTE: Not currently updating embeds, will need to change test when we do @@ -143,7 +143,7 @@ def test_embeds() -> None: \v 5 New verse 5 \f \fr 1.5 \ft A \+w stylish\+w* note \f* \v 6 New verse 6 \f \fr 1.6 \ft Another \+w stylish\+w* note \f* """ - assess(target, result) + assert_usfm_equals(target, result) target = update_usfm( rows, @@ -160,7 +160,7 @@ def test_embeds() -> None: \v 5 New verse 5 \v 6 New verse 6 """ - assess(target, result) + assert_usfm_equals(target, result) def test_trailing_empty_paragraphs() -> None: @@ -200,7 +200,7 @@ def test_trailing_empty_paragraphs() -> None: \b \q1 """ - assess(target, result) + assert_usfm_equals(target, result) def test_headers() -> None: @@ -283,7 +283,7 @@ def test_headers() -> None: \v 3 Y \s1 Updated header """ - assess(target, result) + assert_usfm_equals(target, result) def test_consecutive_markers() -> None: @@ -320,7 +320,7 @@ def test_consecutive_markers() -> None: \v 1 New verse 1 \p \qt \+w WORD\+w*\qt* """ - assess(target, result) + assert_usfm_equals(target, result) def test_verse_ranges() -> None: @@ -356,7 +356,7 @@ def test_verse_ranges() -> None: \v 1-5 New verse range text \p new paragraph 2 """ - assess(target, result) + assert_usfm_equals(target, result) def test_no_update() -> None: @@ -392,7 +392,7 @@ def test_no_update() -> None: \c 1 \v 1 New paragraph 1 New paragraph 2 """ - assess(target, result) + assert_usfm_equals(target, result) # No alignment rows = [ @@ -422,7 +422,7 @@ def test_no_update() -> None: \v 1 New paragraph 1 New paragraph 2 \p """ - assess(target, result) + assert_usfm_equals(target, result) # No text update rows = [] @@ -437,7 +437,7 @@ def test_no_update() -> None: \v 1 Old paragraph 1 \p Old paragraph 2 """ - assess(target, result) + assert_usfm_equals(target, result) def test_split_tokens() -> None: @@ -475,7 +475,7 @@ def test_split_tokens() -> None: \p words split \p words split """ - assess(target, result) + assert_usfm_equals(target, result) def test_no_text() -> None: @@ -510,7 +510,7 @@ def test_no_text() -> None: \c 1 \v 1 \w \w* """ - assess(target, result) + assert_usfm_equals(target, result) def test_consecutive_substring() -> None: @@ -546,7 +546,7 @@ def test_consecutive_substring() -> None: \v 1 string \p ring """ - assess(target, result) + assert_usfm_equals(target, result) def test_verses_out_of_order() -> None: @@ -597,7 +597,7 @@ def test_verses_out_of_order() -> None: \v 1 new verse 1 \p new paragraph 2 """ - assess(target, result) + assert_usfm_equals(target, result) def test_strip_paragraphs_with_header() -> None: @@ -638,7 +638,58 @@ def test_strip_paragraphs_with_header() -> None: \p \v 2 verse 2 """ - assess(target, result) + assert_usfm_equals(target, result) + + +def test_support_verse_zero(): + # Note: Verse 0 has an empty paragraph as the paragraph occurs before verse text, + # so is not included in the verse text as it is for the paragraphs for the other verses. + rows = [ + UpdateUsfmRow(scr_ref("MAT 1:0"), "New verse 0"), + UpdateUsfmRow(scr_ref("MAT 1:0/1:mt"), "New book header"), + UpdateUsfmRow(scr_ref("MAT 1:0/2:s"), "New chapter header"), + UpdateUsfmRow(scr_ref("MAT 1:0/3:p"), ""), + UpdateUsfmRow(scr_ref("MAT 1:0/4:ms"), "New major section header"), + UpdateUsfmRow(scr_ref("MAT 1:0/5:s"), "New section header 1"), + UpdateUsfmRow(scr_ref("MAT 1:1"), "New verse 1"), + UpdateUsfmRow(scr_ref("MAT 1:1/1:s"), "New section header 2"), + UpdateUsfmRow(scr_ref("MAT 1:2"), "New verse 2"), + UpdateUsfmRow(scr_ref("MAT 1:3"), "New verse 3"), + ] + usfm = r"""\id MAT +\mt Old book header +\c 1 +\s Old chapter header +\p +\v 0 Old verse 0 +\ms Old major section header +\s Old section header 1 +\p +\v 1 Old verse 1 +\s Old section header 2 +\p +\v 2 Old verse 2 +\v 3 Old verse 3 +""" + + target = update_usfm(rows, usfm, update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()]) + + result = r"""\id MAT +\mt New book header +\c 1 +\s New chapter header +\p +\v 0 New verse 0 +\ms New major section header +\s New section header 1 +\p +\v 1 New verse 1 +\s New section header 2 +\p +\v 2 New verse 2 +\v 3 New verse 3 +""" + assert_usfm_equals(target, result) def scr_ref(*refs: str) -> List[ScriptureRef]: @@ -683,7 +734,7 @@ def update_usfm( return updater.get_usfm() -def assess(target: Optional[str], truth: str) -> None: +def assert_usfm_equals(target: Optional[str], truth: str) -> None: assert target is not None for target_line, truth_line in zip(target.split("\n"), truth.split("\n")): assert target_line.strip() == truth_line.strip() diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 202f5223..a9c1cdc1 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -916,6 +916,42 @@ def test_update_block_verse_range() -> None: ) +def test_update_block_verse_range_right_to_left_marker() -> None: + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1", "MAT 1:2", "MAT 1:3"), + str("Update 1-3"), + ), + ] + usfm = ( + r"""\id MAT - Test +\c 1 +\v 1""" + + "\u200f" + + """-3 verse 1 through 3 +""" + ) + + update_block_handler = _TestUsfmUpdateBlockHandler() + updated_usfm = update_usfm( + rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler] + ) + expected_usfm = r"""\id MAT - Test +\c 1 +\v 1-3 Update 1-3 +""" + assert_usfm_equals(updated_usfm, expected_usfm) + assert len(update_block_handler.blocks) == 1 + + update_block = update_block_handler.blocks[0] + assert_update_block_equals( + update_block, + ["MAT 1:1", "MAT 1:2", "MAT 1:3"], + (UsfmUpdateBlockElementType.TEXT, "Update 1-3 ", False), + (UsfmUpdateBlockElementType.TEXT, "verse 1 through 3 ", True), + ) + + def test_update_block_footnote_preserve_embeds() -> None: rows = [ UpdateUsfmRow( @@ -1381,6 +1417,83 @@ def test_pass_remark(): assert_usfm_equals(target, result) +def test_update_block_footnote_in_published_chapter_number(): + rows = [UpdateUsfmRow(scr_ref("ESG 1:0/2:s"), "Update 1")] + usfm = r"""\id ESG - Test +\c 1 +\cp A \f + \fr A.1-3: \ft Some note.\f* +\s Heading 1 +""" + update_block_handler = _TestUsfmUpdateBlockHandler() + target = update_usfm( + rows, + usfm, + update_block_handlers=[update_block_handler], + text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + ) + + result = r"""\id ESG +\c 1 +\cp A \f + \fr A.1-3: \ft Some note.\f* +\s Update 1 +""" + assert_usfm_equals(target, result) + + assert len(update_block_handler.blocks) == 2 + assert_update_block_equals( + update_block_handler.blocks[0], + ["ESG 1:0/1:f"], + (UsfmUpdateBlockElementType.EMBED, r"\f + \fr A.1-3: \ft Some note.\f*", False), + ) + assert_update_block_equals( + update_block_handler.blocks[1], + ["ESG 1:0/2:s"], + (UsfmUpdateBlockElementType.TEXT, "Update 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Heading 1 ", True), + ) + + +def test_update_block_footnote_at_start_of_chapter_with_preceding_text(): + rows = [UpdateUsfmRow(scr_ref("ESG 1:0/2:s"), "Update 1")] + usfm = r"""\id ESG - Test +\c 1 +Text 1\f + \fr A.1-3: \ft Some note.\f* +\s Heading 1 +""" + update_block_handler = _TestUsfmUpdateBlockHandler() + target = update_usfm( + rows, + usfm, + update_block_handlers=[update_block_handler], + text_behavior=UpdateUsfmTextBehavior.PREFER_NEW, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + ) + + result = r"""\id ESG - Test +\c 1 Text 1\f + \fr A.1-3: \ft Some note.\f* +\s Update 1 +""" + assert_usfm_equals(target, result) + + assert len(update_block_handler.blocks) == 2 + assert_update_block_equals( + update_block_handler.blocks[0], + ["ESG 1:0/1:f"], + (UsfmUpdateBlockElementType.EMBED, r"\f + \fr A.1-3: \ft Some note.\f*", False), + ) + assert_update_block_equals( + update_block_handler.blocks[1], + ["ESG 1:0/2:s"], + (UsfmUpdateBlockElementType.TEXT, "Update 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Heading 1 ", True), + ) + + def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs] diff --git a/tests/corpora/test_usfm_memory_text.py b/tests/corpora/test_usfm_memory_text.py index 183eb418..aa89412d 100644 --- a/tests/corpora/test_usfm_memory_text.py +++ b/tests/corpora/test_usfm_memory_text.py @@ -208,6 +208,27 @@ def test_get_rows_paragraph_before_nonverse_paragraph() -> None: assert rows[2].text == "header" +def test_get_rows_verse_zero(): + rows: List[TextRow] = get_rows( + r"""\id MAT - Test +\h +\mt +\c 1 +\p \v 0 +\s +\p \v 1 Verse one. +""" + ) + + assert len(rows) == 2, str.join(",", [tr.text for tr in rows]) + + assert rows[0].ref == ScriptureRef.parse("MAT 1:0") + assert rows[0].text == "" + + assert rows[1].ref == ScriptureRef.parse("MAT 1:1") + assert rows[1].text == "Verse one." + + def test_get_rows_style_starting_nonverse_paragraph_after_empty_paragraph() -> None: rows: List[TextRow] = get_rows( r"""\id MAT - Test @@ -227,6 +248,223 @@ def test_get_rows_style_starting_nonverse_paragraph_after_empty_paragraph() -> N assert rows[2].text == "\\w header\\w*" +def test_get_rows_verse_zero_with_text(): + rows: List[TextRow] = get_rows( + r"""\id MAT - Test +\h +\mt +\c 1 +\p \v 0 Verse zero. +\s +\p \v 1 Verse one. +""" + ) + + assert len(rows) == 2, str.join(",", [tr.text for tr in rows]) + + assert rows[0].ref == ScriptureRef.parse("MAT 1:0") + assert rows[0].text == "Verse zero." + + assert rows[1].ref == ScriptureRef.parse("MAT 1:1") + assert rows[1].text == "Verse one." + + +def test_get_rows_private_use_marker(): + rows: List[TextRow] = get_rows( + r"""\id MAT - Test English Apocrypha +\zmt Ignore this paragraph +\mt1 Test English Apocrypha +\pc Copyright Statement \zimagecopyrights +\pc Further copyright statements +""", + include_all_text=True, + ) + + assert len(rows) == 3, str.join(",", [tr.text for tr in rows]) + + assert rows[1].ref == ScriptureRef.parse("MAT 1:0/2:pc") + assert rows[1].text == "Copyright Statement" + + +def test_get_rows_verse_range_with_right_to_left_marker(): + rows: List[TextRow] = get_rows( + r"""\id MAT - Test +\h +\mt +\c 1 +\v 1""" + + "\u200f" + + r"""-2 Verse one and two. +""" + ) + + assert len(rows) == 2, str.join(",", [tr.text for tr in rows]) + + assert rows[0].ref == ScriptureRef.parse("MAT 1:1") + assert rows[0].text == "Verse one and two." + + assert rows[1].ref == ScriptureRef.parse("MAT 1:2") + + +def test_get_rows_non_latin_verse_number(): + rows: List[TextRow] = get_rows( + r"""\id MAT - Test +\c 1 +\p +\v १ Verse 1 +\v 3,৪ Verses 3 and 4 +\p +""", + include_all_text=True, + ) + + assert len(rows) == 4, str.join(",", [tr.text for tr in rows]) + + assert rows[0].ref == ScriptureRef.parse("MAT 1:0/1:p") + assert rows[0].text == "" + + assert rows[1].ref == ScriptureRef.parse("MAT 1:1") + assert rows[1].text == "Verse 1" + + assert rows[2].ref == ScriptureRef.parse("MAT 1:3") + assert rows[2].text == "Verses 3 and 4" + + assert rows[3].ref == ScriptureRef.parse("MAT 1:৪") + assert rows[3].text == "" + + +def test_get_rows_empty_verse_number(): + rows: List[TextRow] = get_rows( + r"""\id MAT - Test +\c 1 +\p +\v +\b +""", + include_all_text=True, + ) + + assert len(rows) == 2, str.join(",", [tr.text for tr in rows]) + + assert rows[0].ref == ScriptureRef.parse("MAT 1:0/1:p") + assert rows[0].text == "" + + assert rows[1].ref == ScriptureRef.parse("MAT 1:0/2:b") + assert rows[1].text == "" + + +def test_get_rows_multiple_empty_verse_numbers(): + rows: List[TextRow] = get_rows( + r"""\id MAT - Test +\c 1 +\p +\v +\p +\v +\p +\v +\p +""", + include_all_text=True, + ) + + assert len(rows) == 4, str.join(",", [tr.text for tr in rows]) + + for i, row in enumerate(rows): + assert row.ref == ScriptureRef.parse(f"MAT 1:0/{i+1}:p") + assert row.text == "" + + +def test_get_rows_empty_verse_number_with_text(): + rows: List[TextRow] = get_rows( + r"""\id MAT - Test +\c 1 +\s heading text +\v \vn 1 verse text +""", + include_all_text=True, + ) + + assert len(rows) == 2, str.join(",", [tr.text for tr in rows]) + + assert rows[0].ref == ScriptureRef.parse("MAT 1:0/1:s") + assert rows[0].text == "heading text" + + assert rows[1].ref == ScriptureRef.parse("MAT 1:0/2:vn") + assert rows[1].text == "1 verse text" + + +def test_get_rows_empty_verse_number_mid_verse(): + rows: List[TextRow] = get_rows( + r"""\id MAT - Test +\c 1 +\p +\v 1 verse 1 text +\v +\v 2 verse 2 text +""", + include_all_text=True, + ) + + assert len(rows) == 3, str.join(",", [tr.text for tr in rows]) + + assert rows[0].ref == ScriptureRef.parse("MAT 1:0/1:p") + assert rows[0].text == "" + + assert rows[1].ref == ScriptureRef.parse("MAT 1:1") + assert rows[1].text == "verse 1 text" + + assert rows[2].ref == ScriptureRef.parse("MAT 1:2") + assert rows[2].text == "verse 2 text" + + +def test_get_rows_invalid_verse_numbers(): + rows: List[TextRow] = get_rows( + r"""\id MAT - Test +\c 1 +\p +\v BK1 text goes here +\v BK 2 text goes here +\v BK 3 text goes here +\v BK 4 text goes here +""", + include_all_text=True, + ) + + assert len(rows) == 1, str.join(",", [tr.text for tr in rows]) + + assert rows[0].ref == ScriptureRef.parse("MAT 1:0/1:p") + assert rows[0].text == "text goes here 2 text goes here 3 text goes here 4 text goes here" + + +def test_get_rows_incomplete_verse_range(): + rows: List[TextRow] = get_rows( + r"""\id MAT - Test +\c 1 +\s heading text +\p +\q1 +\v 1, +\q1 verse 1 text +""", + include_all_text=True, + ) + + assert len(rows) == 4, str.join(",", [tr.text for tr in rows]) + + assert rows[0].ref == ScriptureRef.parse("MAT 1:0/1:s") + assert rows[0].text == "heading text" + + assert rows[1].ref == ScriptureRef.parse("MAT 1:0/2:p") + assert rows[1].text == "" + + assert rows[2].ref == ScriptureRef.parse("MAT 1:1/3:q1") + assert rows[2].text == "" + + assert rows[3].ref == ScriptureRef.parse("MAT 1:1/4:q1") + assert rows[3].text == "verse 1 text" + + def get_rows(usfm: str, include_markers: bool = False, include_all_text: bool = False) -> List[TextRow]: text = UsfmMemoryText( UsfmStylesheet("usfm.sty"), diff --git a/tests/scripture/test_verse_ref.py b/tests/scripture/test_verse_ref.py index 0b0a9ed0..911225ad 100644 --- a/tests/scripture/test_verse_ref.py +++ b/tests/scripture/test_verse_ref.py @@ -189,8 +189,6 @@ def test_invalid() -> None: VerseRef(LAST_BOOK + 1, 1, 1) with raises(ValueError): VerseRef(2, -42, 1) - with raises(ValueError): - VerseRef(2, 1, -4) with raises(ValueError): VerseRef.from_string("MAT 1:") with raises(ValueError): diff --git a/tests/testutils/data/usfm/Tes/custom.vrs b/tests/testutils/data/usfm/Tes/Custom.vrs similarity index 100% rename from tests/testutils/data/usfm/Tes/custom.vrs rename to tests/testutils/data/usfm/Tes/Custom.vrs diff --git a/tests/testutils/memory_paratext_project_versification_error_detector.py b/tests/testutils/memory_paratext_project_versification_error_detector.py index 62911113..512ee930 100644 --- a/tests/testutils/memory_paratext_project_versification_error_detector.py +++ b/tests/testutils/memory_paratext_project_versification_error_detector.py @@ -1,10 +1,10 @@ from typing import Dict, Optional -from machine.corpora import ParatextProjectSettings, ParatextProjectVersificationErrorDetector +from machine.corpora import ParatextProjectSettings, ParatextProjectVersificationErrorDetectorBase from .memory_paratext_project_file_handler import DefaultParatextProjectSettings, MemoryParatextProjectFileHandler -class MemoryParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetector): +class MemoryParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetectorBase): def __init__(self, settings: Optional[ParatextProjectSettings], files: Dict[str, str]) -> None: super().__init__(MemoryParatextProjectFileHandler(files), settings or DefaultParatextProjectSettings()) From 55be044639010b9d32eda95ff7fea2ea44e94398 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 9 Feb 2026 19:44:24 -0500 Subject: [PATCH 2/3] Additional changes for consistency with Machine --- .../file_paratext_project_file_handler.py | 25 ++++++++++--------- .../corpora/paratext_backup_terms_corpus.py | 3 ++- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/machine/corpora/file_paratext_project_file_handler.py b/machine/corpora/file_paratext_project_file_handler.py index a9846645..7482d448 100644 --- a/machine/corpora/file_paratext_project_file_handler.py +++ b/machine/corpora/file_paratext_project_file_handler.py @@ -12,28 +12,29 @@ def __init__(self, project_dir: StrPath) -> None: self._project_dir = Path(project_dir) def exists(self, file_name: str) -> bool: - for actual_file_name in os.listdir(self._project_dir): - if actual_file_name.lower() == file_name.lower(): - return True - return False + return self._get_file_name(file_name) is not None def open(self, file_name: str) -> BinaryIO: - for actual_file_name in os.listdir(self._project_dir): - if actual_file_name.lower() == file_name.lower(): - return open(self._project_dir / actual_file_name, "rb") + actual_file_name = self._get_file_name(file_name) + if actual_file_name is not None: + file_name = actual_file_name return open(self._project_dir / file_name, "rb") def find(self, extension: str) -> Optional[Path]: return next(self._project_dir.glob(f"*{extension}"), None) def create_stylesheet(self, file_name: str) -> UsfmStylesheet: - custom_stylesheet_file_name = "custom.sty" - for actual_file_name in os.listdir(self._project_dir): - if actual_file_name.lower() == custom_stylesheet_file_name: - custom_stylesheet_file_name = actual_file_name - break + custom_stylesheet_file_name = self._get_file_name("custom.sty") + if custom_stylesheet_file_name is None: + custom_stylesheet_file_name = "custom.sty" custom_stylesheet_path = self._project_dir / custom_stylesheet_file_name return UsfmStylesheet( file_name, custom_stylesheet_path if custom_stylesheet_path.is_file() else None, ) + + def _get_file_name(self, case_insensitive_file_name: str) -> Optional[str]: + for actual_file_name in os.listdir(self._project_dir): + if actual_file_name.lower() == case_insensitive_file_name.lower(): + return actual_file_name + return None diff --git a/machine/corpora/paratext_backup_terms_corpus.py b/machine/corpora/paratext_backup_terms_corpus.py index ee80a8e8..75e32b0f 100644 --- a/machine/corpora/paratext_backup_terms_corpus.py +++ b/machine/corpora/paratext_backup_terms_corpus.py @@ -29,8 +29,9 @@ def __init__(self, filename: StrPath, term_categories: Sequence[str], use_term_g text = MemoryText( text_id, [ - TextRow(text_id, key_term.id, key_term.renderings, content_type=TextRowContentType.WORD) + TextRow(text_id, key_term.id, [rendering], content_type=TextRowContentType.WORD) for key_term in key_terms + for rendering in key_term.renderings ], ) self._add_text(text) From a8544a803f3058d0894447de44be1465d229cc6f Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 9 Feb 2026 19:50:42 -0500 Subject: [PATCH 3/3] Address reviewer comments --- .../scripture_ref_usfm_parser_handler_base.py | 12 ++++++------ machine/corpora/update_usfm_parser_handler.py | 4 ++-- machine/scripture/verse_ref.py | 2 ++ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/machine/corpora/scripture_ref_usfm_parser_handler_base.py b/machine/corpora/scripture_ref_usfm_parser_handler_base.py index fc88dcf1..30000595 100644 --- a/machine/corpora/scripture_ref_usfm_parser_handler_base.py +++ b/machine/corpora/scripture_ref_usfm_parser_handler_base.py @@ -25,7 +25,7 @@ def _is_embed_style(marker: Optional[str]) -> bool: return marker is not None and marker.strip("*") in _EMBED_STYLES -def is_private_use_marker(marker: str): +def _is_private_use_marker(marker: str) -> bool: return marker is not None and marker.startswith("z") @@ -82,7 +82,7 @@ def start_para( attributes: Optional[Sequence[UsfmAttribute]], ) -> None: # ignore private-use markers - if is_private_use_marker(marker): + if _is_private_use_marker(marker): return if self._cur_verse_ref.is_default: @@ -93,7 +93,7 @@ def start_para( def end_para(self, state: UsfmParserState, marker: str) -> None: # ignore private-use markers - if is_private_use_marker(marker): + if _is_private_use_marker(marker): return if self._current_text_type == ScriptureTextType.NONVERSE: @@ -146,7 +146,7 @@ def start_char( self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]] ) -> None: # ignore private-use markers - if is_private_use_marker(marker): + if _is_private_use_marker(marker): return # if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment @@ -159,7 +159,7 @@ def end_char( self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool ) -> None: # ignore private-use markers - if is_private_use_marker(marker): + if _is_private_use_marker(marker): return if _is_embed_style(marker): @@ -277,7 +277,7 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None and state.is_verse_para and self._cur_verse_ref.verse_num == 0 and not state.chapter_has_verse_zero - and not is_private_use_marker(para_tag.marker) + and not _is_private_use_marker(para_tag.marker) ): self._start_parent_element(para_tag.marker) self._start_non_verse_text_wrapper(state) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 903ab3d1..9d95850c 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -38,7 +38,7 @@ def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[d self.metadata = metadata -def sanitize_verse_data(verse_data: str) -> str: +def _sanitize_verse_data(verse_data: str) -> str: return verse_data.replace("\u200F", "") @@ -395,7 +395,7 @@ def _collect_updatable_tokens(self, state: UsfmParserState) -> None: while self._token_index <= state.index + state.special_token_count: token = state.tokens[self._token_index] if token.type == UsfmTokenType.VERSE and token.data is not None: - token.data = sanitize_verse_data(token.data) + token.data = _sanitize_verse_data(token.data) if self._current_text_type == ScriptureTextType.EMBED: self._embed_tokens.append(token) elif ( diff --git a/machine/scripture/verse_ref.py b/machine/scripture/verse_ref.py index 86ca1407..f35ea0fe 100644 --- a/machine/scripture/verse_ref.py +++ b/machine/scripture/verse_ref.py @@ -149,6 +149,8 @@ def verse_num(self) -> int: @verse_num.setter def verse_num(self, value: int) -> None: + if value < 0: + raise ValueError("The verse number cannot be negative.") self._verse_num = value self._verse = None