From e6886554e1c0bceb3f0164d88f4537f5c2ba94b1 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 8 Mar 2026 07:51:26 +0100 Subject: [PATCH 1/8] feat(ocr): replace flat ImageRegion model with nested Page/Block/Line/Word tree Preserves hierarchical structure from each provider's API instead of discarding it during conversion. Adds BoundingBox::enclosing() to nvisy-core and updates all six provider backends to build the tree. Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-core/src/math/bounding_box.rs | 10 + crates/nvisy-ocr/src/backend/mod.rs | 8 +- crates/nvisy-ocr/src/backend/output.rs | 192 +++++++------- crates/nvisy-ocr/src/engine/mod.rs | 8 +- crates/nvisy-ocr/src/lib.rs | 2 +- crates/nvisy-ocr/src/prelude.rs | 4 +- .../src/provider/aws_textract/backend.rs | 238 +++++++++++++++--- .../src/provider/azure_docai/backend.rs | 46 +++- .../src/provider/datalab_surya/backend.rs | 63 ++++- .../src/provider/google_vision/backend.rs | 129 +++++++--- .../src/provider/mindee_doctr/backend.rs | 49 +++- .../src/provider/paddle_paddlex/backend.rs | 58 ++++- 12 files changed, 606 insertions(+), 201 deletions(-) diff --git a/crates/nvisy-core/src/math/bounding_box.rs b/crates/nvisy-core/src/math/bounding_box.rs index 7a86d0ad..29ec65a5 100644 --- a/crates/nvisy-core/src/math/bounding_box.rs +++ b/crates/nvisy-core/src/math/bounding_box.rs @@ -108,6 +108,16 @@ impl BoundingBox { if union == 0.0 { 0.0 } else { inter / union } } + /// Returns the smallest box enclosing all boxes in the iterator. + /// + /// Returns [`BoundingBox::default()`] if the iterator is empty. + pub fn enclosing<'a>(mut iter: impl Iterator) -> BoundingBox { + match iter.next() { + None => BoundingBox::default(), + Some(first) => iter.fold(*first, |acc, b| acc.union(b)), + } + } + /// Convert to integer pixel coordinates by rounding each field. 
pub fn to_u32(&self) -> BoundingBoxU32 { BoundingBoxU32 { diff --git a/crates/nvisy-ocr/src/backend/mod.rs b/crates/nvisy-ocr/src/backend/mod.rs index e9558a0f..936668ad 100644 --- a/crates/nvisy-ocr/src/backend/mod.rs +++ b/crates/nvisy-ocr/src/backend/mod.rs @@ -5,7 +5,7 @@ mod output; pub use input::{ImageFormat, ImageInput}; use nvisy_core::Error; -pub use output::{ImageOutput, ImageRegion, TextLevel}; +pub use output::{Block, BlockKind, ImageOutput, Line, Page, Word}; use reqwest_middleware::reqwest::Response; use reqwest_middleware::reqwest::multipart::Part; @@ -61,11 +61,11 @@ impl RunParams { /// Backend trait for OCR providers. /// /// Implementations send image bytes to an OCR service and return -/// typed [`ImageRegion`] results with word-level bounding boxes. +/// hierarchical [`ImageOutput`] results with page/block/line/word structure. /// /// Confidence values **must** be normalised to 0.0..=1.0 before -/// populating [`ImageRegion::confidence`]. Backends whose upstream -/// API uses a different scale (e.g. AWS Textract returns 0–100) are +/// populating [`Word::confidence`]. Backends whose upstream API uses +/// a different scale (e.g. AWS Textract returns 0–100) are /// responsible for converting. #[async_trait::async_trait] pub trait Backend: Send + Sync + 'static { diff --git a/crates/nvisy-ocr/src/backend/output.rs b/crates/nvisy-ocr/src/backend/output.rs index 373d3dbf..45edaaa4 100644 --- a/crates/nvisy-ocr/src/backend/output.rs +++ b/crates/nvisy-ocr/src/backend/output.rs @@ -3,28 +3,11 @@ use nvisy_core::math::{BoundingBox, Polygon}; use nvisy_core::path::ContentSource; use serde::{Deserialize, Serialize}; -use strum::{Display, EnumString}; -/// Hierarchical level of a text region within a document page. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[derive(Display, EnumString, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -#[strum(serialize_all = "snake_case")] -pub enum TextLevel { - /// Full page. 
- Page, - /// Block-level region (paragraph, table, figure). - Block, - /// Single line of text. - Line, - /// Individual word. - Word, -} - -/// A single text region detected by an OCR backend. +/// A single word detected by OCR. #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ImageRegion { - /// Extracted text content. +pub struct Word { + /// Recognised text content. pub text: String, /// Confidence score (0.0..=1.0), if the backend provides one. pub confidence: Option, @@ -32,36 +15,72 @@ pub struct ImageRegion { pub bbox: BoundingBox, /// Polygon vertices for rotated or skewed text regions. pub polygon: Option, - /// Hierarchical level of this text region: word, line, block, etc. - pub level: Option, } -impl ImageRegion { - /// Returns `true` if the extracted text is empty. - pub fn is_empty(&self) -> bool { - self.text.is_empty() - } +/// A line of text: ordered sequence of words. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Line { + /// Concatenated text from all words in this line. + pub text: String, + /// Line-level confidence, if the provider gives one. + pub confidence: Option, + /// Axis-aligned bounding box enclosing the line. + pub bbox: BoundingBox, + /// Polygon vertices for the line region. + pub polygon: Option, + /// Words in reading order. + pub words: Vec, +} - /// Length of the extracted text in bytes. - pub fn text_len(&self) -> usize { - self.text.len() - } +/// Classification of a block region. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum BlockKind { + /// Paragraph / prose. + Text, + /// Tabular content. + Table, + /// Figure / chart. + Figure, + /// Unclassified. + Other, +} - /// Area of the bounding box: width × height. - pub fn area(&self) -> f64 { - self.bbox.width * self.bbox.height - } +/// A block (paragraph, table cell, figure caption, etc.). 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Block { + /// Concatenated text from all lines in this block. + pub text: String, + /// Block-level confidence, if available. + pub confidence: Option, + /// Axis-aligned bounding box enclosing the block. + pub bbox: BoundingBox, + /// Polygon vertices for the block region. + pub polygon: Option, + /// Classification of this block. + pub kind: BlockKind, + /// Lines in reading order. + pub lines: Vec, +} - /// Returns `true` if the confidence meets or exceeds the given threshold. - pub fn meets_threshold(&self, threshold: f64) -> bool { - self.confidence.unwrap_or(0.0) >= threshold - } +/// A single page of OCR results. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Page { + /// 1-based page number. + pub page_number: u32, + /// Page width in pixels, when known. + pub width: Option, + /// Page height in pixels, when known. + pub height: Option, + /// Blocks in reading order. + pub blocks: Vec, } -/// Output from an OCR run on a single image. +/// Complete OCR output for one image/document. /// -/// Groups detected [`ImageRegion`]s together with a [`ContentSource`] -/// derived from the input image for provenance tracking. +/// Groups detected text into a hierarchical tree of +/// [`Page`] → [`Block`] → [`Line`] → [`Word`], together with a +/// [`ContentSource`] derived from the input image for provenance tracking. /// /// [`ContentSource`]: nvisy_core::path::ContentSource #[derive(Debug, Clone, Serialize, Deserialize)] @@ -70,8 +89,8 @@ pub struct ImageOutput { /// /// [`ContentSource`]: nvisy_core::path::ContentSource pub source: ContentSource, - /// Text regions detected in the image. - pub regions: Vec, + /// Pages of OCR results. + pub pages: Vec, } impl ImageOutput { @@ -79,72 +98,59 @@ impl ImageOutput { pub fn new(source: ContentSource) -> Self { Self { source, - regions: Vec::new(), + pages: Vec::new(), } } - /// Insert a region into this output. 
- pub fn insert(&mut self, region: ImageRegion) { - self.regions.push(region); - } - - /// Number of detected regions. + /// Number of pages. pub fn len(&self) -> usize { - self.regions.len() + self.pages.len() } - /// Returns `true` if no regions were detected. + /// Returns `true` if no pages or no words were detected. pub fn is_empty(&self) -> bool { - self.regions.is_empty() - } - - /// Iterator over the detected regions. - pub fn iter(&self) -> std::slice::Iter<'_, ImageRegion> { - self.regions.iter() - } - - /// Mutable iterator over the detected regions. - pub fn iter_mut(&mut self) -> std::slice::IterMut<'_, ImageRegion> { - self.regions.iter_mut() + self.pages.is_empty() || self.words().next().is_none() } - /// Retain only regions that satisfy the predicate. - pub fn retain(&mut self, f: impl FnMut(&ImageRegion) -> bool) { - self.regions.retain(f); + /// Flat iterator over all words across all pages/blocks/lines. + pub fn words(&self) -> impl Iterator { + self.pages + .iter() + .flat_map(|p| &p.blocks) + .flat_map(|b| &b.lines) + .flat_map(|l| &l.words) } - /// Filter regions that meet the given confidence threshold. - pub fn above_threshold(&self, threshold: f64) -> Vec<&ImageRegion> { - self.regions + /// Flat iterator over all lines. + pub fn lines(&self) -> impl Iterator { + self.pages .iter() - .filter(|r| r.meets_threshold(threshold)) - .collect() + .flat_map(|p| &p.blocks) + .flat_map(|b| &b.lines) } -} -impl<'a> IntoIterator for &'a ImageOutput { - type IntoIter = std::slice::Iter<'a, ImageRegion>; - type Item = &'a ImageRegion; - - fn into_iter(self) -> Self::IntoIter { - self.iter() + /// Flat iterator over all blocks. 
+ pub fn blocks(&self) -> impl Iterator { + self.pages.iter().flat_map(|p| &p.blocks) } -} -impl<'a> IntoIterator for &'a mut ImageOutput { - type IntoIter = std::slice::IterMut<'a, ImageRegion>; - type Item = &'a mut ImageRegion; - - fn into_iter(self) -> Self::IntoIter { - self.iter_mut() + /// Full extracted text (pages joined by `\n\n`). + pub fn full_text(&self) -> String { + self.pages + .iter() + .map(|p| { + p.blocks + .iter() + .map(|b| b.text.as_str()) + .collect::>() + .join("\n") + }) + .collect::>() + .join("\n\n") } -} - -impl IntoIterator for ImageOutput { - type IntoIter = std::vec::IntoIter; - type Item = ImageRegion; - fn into_iter(self) -> Self::IntoIter { - self.regions.into_iter() + /// Total word count across all pages. + pub fn word_count(&self) -> usize { + self.words().count() } } diff --git a/crates/nvisy-ocr/src/engine/mod.rs b/crates/nvisy-ocr/src/engine/mod.rs index 21547b19..45dda24e 100644 --- a/crates/nvisy-ocr/src/engine/mod.rs +++ b/crates/nvisy-ocr/src/engine/mod.rs @@ -28,7 +28,7 @@ use crate::backend::{Backend, ImageInput, ImageOutput, RunParams}; /// /// let image = ImageInput::new(png_bytes, ImageFormat::Png); /// let output = engine.run(&image, &RunParams::default()).await?; -/// println!("{} regions detected", output.len()); +/// println!("{} words detected", output.word_count()); /// ``` #[derive(Clone)] pub struct OcrEngine { @@ -57,7 +57,7 @@ impl OcrEngine { ))] pub async fn run(&self, image: &ImageInput, params: &RunParams) -> Result { let output = self.backend.run(image, params).await?; - tracing::debug!(regions = output.len(), "ocr complete"); + tracing::debug!(words = output.word_count(), "ocr complete"); Ok(output) } @@ -69,8 +69,8 @@ impl OcrEngine { params: &RunParams, ) -> Result, Error> { let outputs = self.backend.run_batch(images, params).await?; - let regions: usize = outputs.iter().map(|o| o.len()).sum(); - tracing::debug!(regions, "batch ocr complete"); + let words: usize = outputs.iter().map(|o| 
o.word_count()).sum(); + tracing::debug!(words, "batch ocr complete"); Ok(outputs) } } diff --git a/crates/nvisy-ocr/src/lib.rs b/crates/nvisy-ocr/src/lib.rs index 376c21da..ae0dfbaf 100644 --- a/crates/nvisy-ocr/src/lib.rs +++ b/crates/nvisy-ocr/src/lib.rs @@ -10,6 +10,6 @@ pub mod provider; pub mod prelude; pub use backend::{ - Backend, ImageFormat, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, + Backend, Block, BlockKind, ImageFormat, ImageInput, ImageOutput, Line, Page, RunParams, Word, }; pub use engine::{OcrEngine, OcrProvider}; diff --git a/crates/nvisy-ocr/src/prelude.rs b/crates/nvisy-ocr/src/prelude.rs index 342b0e9b..498466ee 100644 --- a/crates/nvisy-ocr/src/prelude.rs +++ b/crates/nvisy-ocr/src/prelude.rs @@ -1,5 +1,7 @@ //! Convenience re-exports. -pub use crate::backend::{Backend, ImageFormat, ImageInput, ImageOutput, ImageRegion, RunParams}; +pub use crate::backend::{ + Backend, Block, BlockKind, ImageFormat, ImageInput, ImageOutput, Line, Page, RunParams, Word, +}; pub use crate::engine::{OcrEngine, OcrProvider}; pub use crate::provider::*; diff --git a/crates/nvisy-ocr/src/provider/aws_textract/backend.rs b/crates/nvisy-ocr/src/provider/aws_textract/backend.rs index 2a113a56..9dcceda7 100644 --- a/crates/nvisy-ocr/src/provider/aws_textract/backend.rs +++ b/crates/nvisy-ocr/src/provider/aws_textract/backend.rs @@ -2,6 +2,7 @@ //! //! [`Backend`]: crate::Backend +use std::collections::HashMap; use std::fmt; use hmac::{Hmac, Mac}; @@ -13,7 +14,8 @@ use sha2::{Digest, Sha256}; use super::AwsTextractParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, }; /// [`Backend`] implementation for AWS Textract. 
@@ -124,9 +126,18 @@ struct TextractResponse { #[serde(rename_all = "PascalCase")] struct TextractBlock { block_type: String, + id: Option, text: Option, confidence: Option, geometry: Option, + relationships: Option>, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct TextractRelationship { + r#type: String, + ids: Vec, } #[derive(Debug, Deserialize)] @@ -152,6 +163,30 @@ struct TextractPoint { y: f64, } +fn extract_geometry(geom: Option<&TextractGeometry>) -> (BoundingBox, Option) { + match geom { + Some(geom) => { + let bbox = geom + .bounding_box + .as_ref() + .map(|b| BoundingBox { + x: b.left, + y: b.top, + width: b.width, + height: b.height, + }) + .unwrap_or_default(); + + let polygon = geom.polygon.as_ref().map(|pts| Polygon { + vertices: pts.iter().map(|p| Vertex::new(p.x, p.y)).collect(), + }); + + (bbox, polygon) + } + None => (BoundingBox::default(), None), + } +} + #[async_trait::async_trait] impl Backend for AwsTextractBackend { async fn run(&self, image: &ImageInput, params: &RunParams) -> Result { @@ -220,53 +255,116 @@ impl Backend for AwsTextractBackend { })?; let threshold = params.confidence_threshold; + + // Index blocks by ID for relationship lookups. + let block_map: HashMap<&str, &TextractBlock> = parsed + .blocks + .iter() + .filter_map(|b| b.id.as_deref().map(|id| (id, b))) + .collect(); + + fn child_ids(block: &TextractBlock) -> Vec<&str> { + block + .relationships + .as_ref() + .into_iter() + .flatten() + .filter(|r| r.r#type == "CHILD") + .flat_map(|r| r.ids.iter().map(|s| s.as_str())) + .collect() + } + let mut output = ImageOutput::new(image.source.derive()); + // Iterate PAGE blocks; build LINE→WORD tree from relationships. 
+ let mut page_number = 0u32; for block in &parsed.blocks { - if block.block_type != "WORD" { + if block.block_type != "PAGE" { continue; } + page_number += 1; + + let line_ids = child_ids(block); + let mut lines = Vec::new(); + + for line_id in &line_ids { + let line_block = match block_map.get(line_id) { + Some(b) if b.block_type == "LINE" => b, + _ => continue, + }; + + let word_ids = child_ids(line_block); + let mut words = Vec::new(); + + for word_id in &word_ids { + let word_block = match block_map.get(word_id) { + Some(b) if b.block_type == "WORD" => b, + _ => continue, + }; + + let text = match &word_block.text { + Some(t) => t.clone(), + None => continue, + }; + + // Textract returns confidence as 0–100; normalise to 0–1. + let confidence = word_block.confidence.unwrap_or(0.0) / 100.0; + if confidence < threshold { + continue; + } - let text = match &block.text { - Some(t) => t.clone(), - None => continue, - }; - - // Textract returns confidence as 0–100; normalise to 0–1. - let confidence = block.confidence.unwrap_or(0.0) / 100.0; - - if confidence < threshold { - continue; - } + let (bbox, polygon) = extract_geometry(word_block.geometry.as_ref()); - let (bbox, polygon) = match &block.geometry { - Some(geom) => { - let bbox = geom - .bounding_box - .as_ref() - .map(|b| BoundingBox { - x: b.left, - y: b.top, - width: b.width, - height: b.height, - }) - .unwrap_or_default(); - - let polygon = geom.polygon.as_ref().map(|pts| Polygon { - vertices: pts.iter().map(|p| Vertex::new(p.x, p.y)).collect(), + words.push(Word { + text, + confidence: Some(confidence), + bbox, + polygon, }); + } - (bbox, polygon) + if words.is_empty() { + continue; } - None => (BoundingBox::default(), None), - }; - - output.insert(ImageRegion { - text, - confidence: Some(confidence), - bbox, - polygon, - level: Some(TextLevel::Word), + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::>() + .join(" "); + let line_confidence = + line_block.confidence.map(|c| c / 
100.0); + let (line_bbox, line_polygon) = + extract_geometry(line_block.geometry.as_ref()); + + lines.push(Line { + text: line_text, + confidence: line_confidence, + bbox: line_bbox, + polygon: line_polygon, + words, + }); + } + + let block_text = lines + .iter() + .map(|l| l.text.as_str()) + .collect::>() + .join("\n"); + let (page_bbox, _) = extract_geometry(block.geometry.as_ref()); + + output.pages.push(Page { + page_number, + width: Some(page_bbox.width), + height: Some(page_bbox.height), + blocks: vec![Block { + text: block_text, + confidence: None, + bbox: page_bbox, + polygon: None, + kind: BlockKind::Text, + lines, + }], }); } @@ -349,6 +447,70 @@ mod tests { assert!((bbox.width - 0.2).abs() < 0.001); } + #[test] + fn build_hierarchy_from_relationships() { + let json = serde_json::json!({ + "Blocks": [ + { + "BlockType": "PAGE", + "Id": "page-1", + "Geometry": { + "BoundingBox": { "Width": 1.0, "Height": 1.0, "Left": 0.0, "Top": 0.0 } + }, + "Relationships": [{ + "Type": "CHILD", + "Ids": ["line-1"] + }] + }, + { + "BlockType": "LINE", + "Id": "line-1", + "Text": "hello world", + "Confidence": 98.0, + "Geometry": { + "BoundingBox": { "Width": 0.5, "Height": 0.05, "Left": 0.1, "Top": 0.3 } + }, + "Relationships": [{ + "Type": "CHILD", + "Ids": ["word-1", "word-2"] + }] + }, + { + "BlockType": "WORD", + "Id": "word-1", + "Text": "hello", + "Confidence": 99.0, + "Geometry": { + "BoundingBox": { "Width": 0.2, "Height": 0.05, "Left": 0.1, "Top": 0.3 } + } + }, + { + "BlockType": "WORD", + "Id": "word-2", + "Text": "world", + "Confidence": 97.0, + "Geometry": { + "BoundingBox": { "Width": 0.2, "Height": 0.05, "Left": 0.35, "Top": 0.3 } + } + } + ] + }); + + let resp: TextractResponse = serde_json::from_value(json).unwrap(); + assert_eq!(resp.blocks.len(), 4); + + // Verify the relationship structure. 
+ let page = &resp.blocks[0]; + assert_eq!(page.block_type, "PAGE"); + let rels = page.relationships.as_ref().unwrap(); + assert_eq!(rels[0].ids, vec!["line-1"]); + + let line = &resp.blocks[1]; + assert_eq!(line.block_type, "LINE"); + let rels = line.relationships.as_ref().unwrap(); + assert_eq!(rels[0].ids, vec!["word-1", "word-2"]); + } + #[test] fn format_datetime_known_epoch() { // 2024-01-15T11:30:45Z = 1705318245 seconds since epoch diff --git a/crates/nvisy-ocr/src/provider/azure_docai/backend.rs b/crates/nvisy-ocr/src/provider/azure_docai/backend.rs index 5abdaab7..33579361 100644 --- a/crates/nvisy-ocr/src/provider/azure_docai/backend.rs +++ b/crates/nvisy-ocr/src/provider/azure_docai/backend.rs @@ -5,14 +5,15 @@ use std::fmt; use nvisy_core::Error; -use nvisy_core::math::{Polygon, Vertex}; +use nvisy_core::math::{BoundingBox, Polygon, Vertex}; use nvisy_http::HttpClient; use serde::Deserialize; use tokio::time::{Duration, sleep}; use super::AzureDocaiParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, }; /// [`Backend`] implementation for Azure Document Intelligence. 
@@ -193,8 +194,10 @@ impl Backend for AzureDocaiBackend { None => return Ok(output), }; - for page in &result.pages { - for word in &page.words { + for (page_idx, azure_page) in result.pages.iter().enumerate() { + let mut words = Vec::new(); + + for word in &azure_page.words { if word.confidence < threshold { continue; } @@ -216,14 +219,45 @@ impl Backend for AzureDocaiBackend { .map(|p| p.bounding_box()) .unwrap_or_default(); - output.insert(ImageRegion { + words.push(Word { text: word.content.clone(), confidence: Some(word.confidence), bbox, polygon, - level: Some(TextLevel::Word), }); } + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::>() + .join(" "); + let line_bbox = + BoundingBox::enclosing(words.iter().map(|w| &w.bbox)); + + let line = Line { + text: line_text.clone(), + confidence: None, + bbox: line_bbox, + polygon: None, + words, + }; + + let block = Block { + text: line_text, + confidence: None, + bbox: line_bbox, + polygon: None, + kind: BlockKind::Text, + lines: vec![line], + }; + + output.pages.push(Page { + page_number: (page_idx + 1) as u32, + width: None, + height: None, + blocks: vec![block], + }); } Ok(output) diff --git a/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs b/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs index 2fd8e5b6..497cf94e 100644 --- a/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs +++ b/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs @@ -10,17 +10,17 @@ use serde::Deserialize; use super::SuryaParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, image_part, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, image_part, }; /// [`Backend`] implementation for Surya OCR. /// /// Sends images as multipart form data to `{base_url}/ocr` and parses -/// word-level results into [`ImageRegion`]. 
Surya returns both a 4-point -/// polygon and an axis-aligned bounding box in pixel coordinates. +/// the response into a hierarchical page/block/line/word tree. +/// Surya's `TextLine` maps directly to [`Line`]. /// /// [`Backend`]: crate::Backend -/// [`ImageRegion`]: crate::ImageRegxion #[derive(Debug)] pub struct SuryaBackend { client: HttpClient, @@ -94,9 +94,13 @@ impl Backend for SuryaBackend { let threshold = params.confidence_threshold; let mut output = ImageOutput::new(image.source.derive()); - for page in &parsed.pages { - for line in &page.text_lines { - for word in &line.words { + for (page_idx, surya_page) in parsed.pages.iter().enumerate() { + let mut lines = Vec::new(); + + for text_line in &surya_page.text_lines { + let mut words = Vec::new(); + + for word in &text_line.words { if word.confidence < threshold { continue; } @@ -118,15 +122,56 @@ impl Backend for SuryaBackend { .collect(), }; - output.insert(ImageRegion { + words.push(Word { text: word.text.clone(), confidence: Some(word.confidence), bbox, polygon: Some(polygon), - level: Some(TextLevel::Word), }); } + + if words.is_empty() { + continue; + } + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::>() + .join(" "); + let line_bbox = + BoundingBox::enclosing(words.iter().map(|w| &w.bbox)); + + lines.push(Line { + text: line_text, + confidence: None, + bbox: line_bbox, + polygon: None, + words, + }); } + + let block_text = lines + .iter() + .map(|l| l.text.as_str()) + .collect::>() + .join("\n"); + let block_bbox = + BoundingBox::enclosing(lines.iter().map(|l| &l.bbox)); + + output.pages.push(Page { + page_number: (page_idx + 1) as u32, + width: None, + height: None, + blocks: vec![Block { + text: block_text, + confidence: None, + bbox: block_bbox, + polygon: None, + kind: BlockKind::Text, + lines, + }], + }); } Ok(output) diff --git a/crates/nvisy-ocr/src/provider/google_vision/backend.rs b/crates/nvisy-ocr/src/provider/google_vision/backend.rs index 
41a58ce7..b77817ec 100644 --- a/crates/nvisy-ocr/src/provider/google_vision/backend.rs +++ b/crates/nvisy-ocr/src/provider/google_vision/backend.rs @@ -5,19 +5,21 @@ use std::fmt; use nvisy_core::Error; -use nvisy_core::math::{Polygon, Vertex}; +use nvisy_core::math::{BoundingBox, Polygon, Vertex}; use nvisy_http::HttpClient; use serde::Deserialize; use super::GoogleVisionParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, }; /// [`Backend`] implementation for Google Cloud Vision API. /// /// Sends images as base64-encoded JSON to the `images:annotate` endpoint -/// and parses word-level results from the `fullTextAnnotation` response. +/// and parses the `fullTextAnnotation` response into a hierarchical +/// page/block/line/word tree. /// /// [`Backend`]: crate::Backend pub struct GoogleVisionBackend { @@ -104,6 +106,32 @@ struct GvVertex { y: Option, } +fn gv_polygon(bp: &GvBoundingPoly) -> Polygon { + Polygon { + vertices: bp + .vertices + .iter() + .map(|v| { + Vertex::new( + f64::from(v.x.unwrap_or(0)), + f64::from(v.y.unwrap_or(0)), + ) + }) + .collect(), + } +} + +fn gv_bbox_polygon(bp: Option<&GvBoundingPoly>) -> (BoundingBox, Option) { + match bp { + Some(bp) => { + let polygon = gv_polygon(bp); + let bbox = polygon.bounding_box(); + (bbox, Some(polygon)) + } + None => (BoundingBox::default(), None), + } +} + #[async_trait::async_trait] impl Backend for GoogleVisionBackend { async fn run(&self, image: &ImageInput, params: &RunParams) -> Result { @@ -142,51 +170,92 @@ impl Backend for GoogleVisionBackend { let threshold = params.confidence_threshold; let mut output = ImageOutput::new(image.source.derive()); - for result in &parsed.responses { + for (result_idx, result) in parsed.responses.iter().enumerate() { let annotation = match &result.full_text_annotation { Some(a) => a, None => continue, 
}; - for page in &annotation.pages { - for block in &page.blocks { - for paragraph in &block.paragraphs { - for word in ¶graph.words { - if word.confidence < threshold { + for (page_idx, gv_page) in annotation.pages.iter().enumerate() { + let mut blocks = Vec::new(); + + for gv_block in &gv_page.blocks { + let mut lines = Vec::new(); + + // Each GV paragraph maps to a Line. + for paragraph in &gv_block.paragraphs { + let mut words = Vec::new(); + + for gv_word in ¶graph.words { + if gv_word.confidence < threshold { continue; } let text: String = - word.symbols.iter().map(|s| s.text.as_str()).collect(); - - let polygon = word.bounding_box.as_ref().map(|bp| Polygon { - vertices: bp - .vertices - .iter() - .map(|v| { - Vertex::new( - f64::from(v.x.unwrap_or(0)), - f64::from(v.y.unwrap_or(0)), - ) - }) - .collect(), - }); + gv_word.symbols.iter().map(|s| s.text.as_str()).collect(); - let bbox = polygon - .as_ref() - .map(|p| p.bounding_box()) - .unwrap_or_default(); + let (bbox, polygon) = + gv_bbox_polygon(gv_word.bounding_box.as_ref()); - output.insert(ImageRegion { + words.push(Word { text, - confidence: Some(word.confidence), + confidence: Some(gv_word.confidence), bbox, polygon, - level: Some(TextLevel::Word), }); } + + if words.is_empty() { + continue; + } + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::>() + .join(" "); + let line_bbox = BoundingBox::enclosing( + words.iter().map(|w| &w.bbox), + ); + + lines.push(Line { + text: line_text, + confidence: None, + bbox: line_bbox, + polygon: None, + words, + }); } + + if lines.is_empty() { + continue; + } + + let block_text = lines + .iter() + .map(|l| l.text.as_str()) + .collect::>() + .join("\n"); + let block_bbox = BoundingBox::enclosing( + lines.iter().map(|l| &l.bbox), + ); + + blocks.push(Block { + text: block_text, + confidence: None, + bbox: block_bbox, + polygon: None, + kind: BlockKind::Text, + lines, + }); } + + output.pages.push(Page { + page_number: (result_idx + page_idx + 
1) as u32, + width: None, + height: None, + blocks, + }); } } diff --git a/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs b/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs index e5562b9f..14a85203 100644 --- a/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs +++ b/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs @@ -3,25 +3,24 @@ //! [`Backend`]: crate::Backend use nvisy_core::Error; -use nvisy_core::math::{Polygon, Vertex}; +use nvisy_core::math::{BoundingBox, Polygon, Vertex}; use nvisy_http::HttpClient; use reqwest_middleware::reqwest::multipart::Form; use serde::Deserialize; use super::DoctrParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, image_part, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, image_part, }; /// [`Backend`] implementation for DocTR. /// /// Sends images as multipart form data to `{base_url}/ocr` and parses -/// word-level results into [`ImageRegion`]. DocTR returns normalised 0..1 -/// coordinates that are denormalised using the `dimensions` field from -/// the response. +/// word-level results into a hierarchical tree. DocTR returns normalised +/// 0..1 coordinates that are denormalised using the `dimensions` field. 
/// /// [`Backend`]: crate::Backend -/// [`ImageRegion`]: crate::ImageRegion #[derive(Debug)] pub struct DoctrBackend { client: HttpClient, base_url: String, } @@ -90,8 +89,9 @@ impl Backend for DoctrBackend { let threshold = params.confidence_threshold; let mut output = ImageOutput::new(image.source.derive()); - for page in &parsed.pages { + for (page_idx, page) in parsed.pages.iter().enumerate() { let [height, width] = page.dimensions; + let mut words = Vec::new(); for word in &page.words { if word.confidence < threshold { @@ -115,14 +115,45 @@ }; let bbox = polygon.bounding_box(); - output.insert(ImageRegion { + words.push(Word { text: word.value.clone(), confidence: Some(word.confidence), bbox, polygon: Some(polygon), - level: Some(TextLevel::Word), }); } + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::<Vec<_>>() + .join(" "); + let line_bbox = + BoundingBox::enclosing(words.iter().map(|w| &w.bbox)); + + let line = Line { + text: line_text.clone(), + confidence: None, + bbox: line_bbox, + polygon: None, + words, + }; + + let block = Block { + text: line_text, + confidence: None, + bbox: line_bbox, + polygon: None, + kind: BlockKind::Text, + lines: vec![line], + }; + + output.pages.push(Page { + page_number: (page_idx + 1) as u32, + width: Some(width), + height: Some(height), + blocks: vec![block], + }); } Ok(output) diff --git a/crates/nvisy-ocr/src/provider/paddle_paddlex/backend.rs b/crates/nvisy-ocr/src/provider/paddle_paddlex/backend.rs index 2dd8236e..5ca411ac 100644 --- a/crates/nvisy-ocr/src/provider/paddle_paddlex/backend.rs +++ b/crates/nvisy-ocr/src/provider/paddle_paddlex/backend.rs @@ -3,23 +3,24 @@ //! 
[`Backend`]: crate::Backend use nvisy_core::Error; -use nvisy_core::math::{Polygon, Vertex}; +use nvisy_core::math::{BoundingBox, Polygon, Vertex}; use nvisy_http::HttpClient; use reqwest_middleware::reqwest::multipart::Form; use serde::Deserialize; use super::PaddleXParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, image_part, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, image_part, }; /// [`Backend`] implementation for PaddleX PP-OCRv5. /// /// Sends images as multipart form data to `{base_url}/ocr` with -/// `returnWordBox=true` and parses word-level results into [`ImageRegion`]. +/// `returnWordBox=true` and parses word-level results into a +/// hierarchical page/block/line/word tree. /// /// [`Backend`]: crate::Backend -/// [`ImageRegion`]: crate::ImageRegion #[derive(Debug)] pub struct PaddleXBackend { client: HttpClient, base_url: String, } @@ -99,7 +100,11 @@ impl Backend for PaddleXBackend { let threshold = params.confidence_threshold; let mut output = ImageOutput::new(image.source.derive()); + let mut lines = Vec::new(); + for ocr_result in &parsed.result.ocr_results { + let mut words = Vec::new(); + for word in &ocr_result.word_results { if word.confidence < threshold { continue; @@ -114,16 +119,57 @@ }; let bbox = polygon.bounding_box(); - output.insert(ImageRegion { + words.push(Word { text: word.text.clone(), confidence: Some(word.confidence), bbox, polygon: Some(polygon), - level: Some(TextLevel::Word), }); } + + if words.is_empty() { + continue; + } + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::<Vec<_>>() + .join(" "); + let line_bbox = + BoundingBox::enclosing(words.iter().map(|w| &w.bbox)); + + lines.push(Line { + text: line_text, + confidence: None, + bbox: line_bbox, + polygon: None, + words, + }); } + let block_text = lines + .iter() + .map(|l| l.text.as_str()) + .collect::<Vec<_>>() + 
.join("\n"); + let block_bbox = + BoundingBox::enclosing(lines.iter().map(|l| &l.bbox)); + + output.pages.push(Page { + page_number: 1, + width: None, + height: None, + blocks: vec![Block { + text: block_text, + confidence: None, + bbox: block_bbox, + polygon: None, + kind: BlockKind::Text, + lines, + }], + }); + Ok(output) } } From fc45db3c9bff2a09472a86e0cac88cd70371384f Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 8 Mar 2026 07:51:44 +0100 Subject: [PATCH 2/8] style(core): format BoundingBox derive attributes Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-core/src/math/bounding_box.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/crates/nvisy-core/src/math/bounding_box.rs b/crates/nvisy-core/src/math/bounding_box.rs index 29ec65a5..722f96da 100644 --- a/crates/nvisy-core/src/math/bounding_box.rs +++ b/crates/nvisy-core/src/math/bounding_box.rs @@ -8,16 +8,8 @@ use serde::{Deserialize, Serialize}; /// Coordinates are `f64` to support both pixel and normalized (0.0–1.0) /// values from detection models. Use [`BoundingBoxU32`] (or [`Into`]) /// when integer pixel coordinates are needed for rendering. -#[derive( - Debug, - Clone, - Copy, - Default, - PartialEq, - Serialize, - Deserialize, - JsonSchema -)] +#[derive(Debug, Clone, Copy, Default, PartialEq)] +#[derive(Serialize, Deserialize, JsonSchema)] pub struct BoundingBox { /// Horizontal offset of the top-left corner (pixels or normalized). pub x: f64, From c107dac8f528a7e3e56669e65548da2ec0fe43ed Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 8 Mar 2026 07:59:07 +0100 Subject: [PATCH 3/8] refactor(ocr): remove DocTR provider Delete the mindee_doctr backend, params, and module. Update OcrProvider enum, doc examples, and README to reflect five remaining providers. 
Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-ocr/README.md | 9 +- crates/nvisy-ocr/src/engine/mod.rs | 4 +- crates/nvisy-ocr/src/engine/params.rs | 8 +- .../src/provider/mindee_doctr/backend.rs | 220 ------------------ .../src/provider/mindee_doctr/mod.rs | 10 - .../src/provider/mindee_doctr/params.rs | 10 - crates/nvisy-ocr/src/provider/mod.rs | 2 - 7 files changed, 7 insertions(+), 256 deletions(-) delete mode 100644 crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs delete mode 100644 crates/nvisy-ocr/src/provider/mindee_doctr/mod.rs delete mode 100644 crates/nvisy-ocr/src/provider/mindee_doctr/params.rs diff --git a/crates/nvisy-ocr/README.md b/crates/nvisy-ocr/README.md index 9dcd0459..6ecfe65d 100644 --- a/crates/nvisy-ocr/README.md +++ b/crates/nvisy-ocr/README.md @@ -4,11 +4,10 @@ OCR backend trait, type-erased engine, and provider implementations for the Nvisy runtime. -Defines the `Backend` trait for text extraction from images and ships six +Defines the `Backend` trait for text extraction from images and ships five provider implementations across local and cloud services: **Local** (always available): -- `DoctrBackend`: DocTR server (multipart upload, normalised coordinates) - `PaddleXBackend`: PaddleX PP-OCRv5 server (multipart upload, word-level boxes) - `SuryaBackend`: Surya OCR server (multipart upload, pixel coordinates) @@ -17,9 +16,9 @@ provider implementations across local and cloud services: - `GoogleVisionBackend`: Google Cloud Vision API (`google-vision` feature) - `AzureDocaiBackend`: Azure Document Intelligence with async polling (`azure-docai` feature) -Every backend returns `ImageOutput` containing a list of `ImageRegion`s, -each with extracted text, optional confidence score, bounding box, polygon -vertices for rotated text, and hierarchical text-level annotations. 
+Every backend returns `ImageOutput` containing a hierarchical tree of +`Page` → `Block` → `Line` → `Word`, each with extracted text, optional +confidence score, bounding box, and polygon vertices for rotated text. The `Engine` wrapper provides a type-erased entry point with built-in `tracing` instrumentation for request-level observability. diff --git a/crates/nvisy-ocr/src/engine/mod.rs b/crates/nvisy-ocr/src/engine/mod.rs index 45dda24e..6a6c8024 100644 --- a/crates/nvisy-ocr/src/engine/mod.rs +++ b/crates/nvisy-ocr/src/engine/mod.rs @@ -21,9 +21,9 @@ use crate::backend::{Backend, ImageInput, ImageOutput, RunParams}; /// /// ```ignore /// use nvisy_ocr::{OcrEngine, ImageInput, ImageFormat, RunParams}; -/// use nvisy_ocr::provider::{DoctrBackend, DoctrParams}; +/// use nvisy_ocr::provider::{SuryaBackend, SuryaParams}; /// -/// let backend = DoctrBackend::new(DoctrParams { base_url: "http://localhost:8000".into() }); +/// let backend = SuryaBackend::new(SuryaParams { base_url: "http://localhost:8000".into() }); /// let engine = OcrEngine::new(backend); /// /// let image = ImageInput::new(png_bytes, ImageFormat::Png); diff --git a/crates/nvisy-ocr/src/engine/params.rs b/crates/nvisy-ocr/src/engine/params.rs index 61f92e98..b9fcd0d5 100644 --- a/crates/nvisy-ocr/src/engine/params.rs +++ b/crates/nvisy-ocr/src/engine/params.rs @@ -5,9 +5,7 @@ use serde::{Deserialize, Serialize}; use crate::provider::{AwsTextractBackend, AwsTextractParams}; #[cfg(feature = "azure-docai")] use crate::provider::{AzureDocaiBackend, AzureDocaiParams}; -use crate::provider::{ - DoctrBackend, DoctrParams, PaddleXBackend, PaddleXParams, SuryaBackend, SuryaParams, -}; +use crate::provider::{PaddleXBackend, PaddleXParams, SuryaBackend, SuryaParams}; #[cfg(feature = "google-vision")] use crate::provider::{GoogleVisionBackend, GoogleVisionParams}; @@ -21,8 +19,6 @@ use crate::provider::{GoogleVisionBackend, GoogleVisionParams}; pub enum OcrProvider { /// Datalab Surya OCR. 
Surya(SuryaParams), - /// Mindee DocTR. - Doctr(DoctrParams), /// PaddlePaddle PaddleX PP-OCRv5. PaddleX(PaddleXParams), /// AWS Textract. @@ -44,7 +40,6 @@ impl OcrProvider { pub fn into_engine(self) -> super::OcrEngine { match self { Self::Surya(p) => super::OcrEngine::new(SuryaBackend::new(p)), - Self::Doctr(p) => super::OcrEngine::new(DoctrBackend::new(p)), Self::PaddleX(p) => super::OcrEngine::new(PaddleXBackend::new(p)), #[cfg(feature = "aws-textract")] Self::AwsTextract(p) => super::OcrEngine::new(AwsTextractBackend::new(p)), @@ -62,7 +57,6 @@ impl OcrProvider { pub fn into_engine_with_client(self, client: HttpClient) -> super::OcrEngine { match self { Self::Surya(p) => super::OcrEngine::new(SuryaBackend::with_client(client, p)), - Self::Doctr(p) => super::OcrEngine::new(DoctrBackend::with_client(client, p)), Self::PaddleX(p) => super::OcrEngine::new(PaddleXBackend::with_client(client, p)), #[cfg(feature = "aws-textract")] Self::AwsTextract(p) => { diff --git a/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs b/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs deleted file mode 100644 index 14a85203..00000000 --- a/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs +++ /dev/null @@ -1,220 +0,0 @@ -//! [`Backend`] implementation for DocTR. -//! -//! [`Backend`]: crate::Backend - -use nvisy_core::Error; -use nvisy_core::math::{BoundingBox, Polygon, Vertex}; -use nvisy_http::HttpClient; -use reqwest_middleware::reqwest::multipart::Form; -use serde::Deserialize; - -use super::DoctrParams; -use crate::backend::{ - Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, - check_response, image_part, -}; - -/// [`Backend`] implementation for DocTR. -/// -/// Sends images as multipart form data to `{base_url}/ocr` and parses -/// word-level results into a hierarchical tree. DocTR returns normalised -/// 0..1 coordinates that are denormalised using the `dimensions` field. 
-/// -/// [`Backend`]: crate::Backend -#[derive(Debug)] -pub struct DoctrBackend { - client: HttpClient, - base_url: String, -} - -impl DoctrBackend { - /// Create a new backend with default HTTP configuration. - pub fn new(params: DoctrParams) -> Self { - Self::with_client(HttpClient::default(), params) - } - - /// Create a new backend with a pre-configured HTTP client. - pub fn with_client(client: HttpClient, params: DoctrParams) -> Self { - Self { - client, - base_url: params.base_url, - } - } -} - -#[derive(Debug, Deserialize)] -struct DoctrResponse { - pages: Vec, -} - -#[derive(Debug, Deserialize)] -struct DoctrPage { - /// `[height, width]` in pixels. - dimensions: [f64; 2], - words: Vec, -} - -#[derive(Debug, Deserialize)] -struct DoctrWord { - value: String, - #[serde(default)] - confidence: f64, - /// `[[x_min, y_min], [x_max, y_max]]` in normalised 0–1 coords. - geometry: [[f64; 2]; 2], -} - -#[async_trait::async_trait] -impl Backend for DoctrBackend { - async fn run(&self, image: &ImageInput, params: &RunParams) -> Result { - let file_part = image_part(image)?; - - let form = Form::new().part("file", file_part); - - let url = format!("{}/ocr", self.base_url.trim_end_matches('/')); - - let resp = self - .client - .post(&url) - .multipart(form) - .send() - .await - .map_err(|e| Error::connection(e.to_string(), "doctr_ocr", true))?; - - let resp = check_response(resp, "DocTR").await?; - - let parsed: DoctrResponse = resp.json().await.map_err(|e| { - Error::runtime(format!("DocTR JSON parse error: {e}"), "doctr_ocr", false) - })?; - - let threshold = params.confidence_threshold; - let mut output = ImageOutput::new(image.source.derive()); - - for (page_idx, page) in parsed.pages.iter().enumerate() { - let [height, width] = page.dimensions; - let mut words = Vec::new(); - - for word in &page.words { - if word.confidence < threshold { - continue; - } - - let [[x_min_n, y_min_n], [x_max_n, y_max_n]] = word.geometry; - - let x_min = x_min_n * width; - let y_min 
= y_min_n * height; - let x_max = x_max_n * width; - let y_max = y_max_n * height; - - let polygon = Polygon { - vertices: vec![ - Vertex::new(x_min, y_min), // TL - Vertex::new(x_max, y_min), // TR - Vertex::new(x_max, y_max), // BR - Vertex::new(x_min, y_max), // BL - ], - }; - let bbox = polygon.bounding_box(); - - words.push(Word { - text: word.value.clone(), - confidence: Some(word.confidence), - bbox, - polygon: Some(polygon), - }); - } - - let line_text = words - .iter() - .map(|w| w.text.as_str()) - .collect::>() - .join(" "); - let line_bbox = - BoundingBox::enclosing(words.iter().map(|w| &w.bbox)); - - let line = Line { - text: line_text.clone(), - confidence: None, - bbox: line_bbox, - polygon: None, - words, - }; - - let block = Block { - text: line_text, - confidence: None, - bbox: line_bbox, - polygon: None, - kind: BlockKind::Text, - lines: vec![line], - }; - - output.pages.push(Page { - page_number: (page_idx + 1) as u32, - width: Some(width), - height: Some(height), - blocks: vec![block], - }); - } - - Ok(output) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn parse_response() { - let json = serde_json::json!({ - "pages": [{ - "dimensions": [1000.0, 2000.0], - "words": [ - { - "value": "hello", - "confidence": 0.97, - "geometry": [[0.05, 0.10], [0.15, 0.14]] - }, - { - "value": "world", - "confidence": 0.95, - "geometry": [[0.20, 0.10], [0.30, 0.14]] - } - ] - }] - }); - - let resp: DoctrResponse = serde_json::from_value(json).unwrap(); - assert_eq!(resp.pages.len(), 1); - assert_eq!(resp.pages[0].words.len(), 2); - - let page = &resp.pages[0]; - let [height, width] = page.dimensions; - let word = &page.words[0]; - - // Denormalise: x_min = 0.05 * 2000 = 100, y_min = 0.10 * 1000 = 100 - let x_min = word.geometry[0][0] * width; - let y_min = word.geometry[0][1] * height; - let x_max = word.geometry[1][0] * width; - let y_max = word.geometry[1][1] * height; - - assert!((x_min - 100.0).abs() < 0.01); - assert!((y_min - 
100.0).abs() < 0.01); - assert!((x_max - 300.0).abs() < 0.01); - assert!((y_max - 140.0).abs() < 0.01); - - let polygon = Polygon { - vertices: vec![ - Vertex::new(x_min, y_min), - Vertex::new(x_max, y_min), - Vertex::new(x_max, y_max), - Vertex::new(x_min, y_max), - ], - }; - let bbox = polygon.bounding_box(); - assert!((bbox.x - 100.0).abs() < 0.01); - assert!((bbox.y - 100.0).abs() < 0.01); - assert!((bbox.width - 200.0).abs() < 0.01); - assert!((bbox.height - 40.0).abs() < 0.01); - } -} diff --git a/crates/nvisy-ocr/src/provider/mindee_doctr/mod.rs b/crates/nvisy-ocr/src/provider/mindee_doctr/mod.rs deleted file mode 100644 index 338f49dd..00000000 --- a/crates/nvisy-ocr/src/provider/mindee_doctr/mod.rs +++ /dev/null @@ -1,10 +0,0 @@ -//! DocTR OCR backend. -//! -//! Sends images as multipart form data to a DocTR server and parses -//! word-level results with normalised-to-pixel coordinate conversion. - -mod backend; -mod params; - -pub use backend::DoctrBackend; -pub use params::DoctrParams; diff --git a/crates/nvisy-ocr/src/provider/mindee_doctr/params.rs b/crates/nvisy-ocr/src/provider/mindee_doctr/params.rs deleted file mode 100644 index ec73c615..00000000 --- a/crates/nvisy-ocr/src/provider/mindee_doctr/params.rs +++ /dev/null @@ -1,10 +0,0 @@ -use serde::{Deserialize, Serialize}; - -/// Constructor parameters for [`DoctrBackend`]. -/// -/// [`DoctrBackend`]: super::DoctrBackend -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DoctrParams { - /// Base URL of the DocTR server. - pub base_url: String, -} diff --git a/crates/nvisy-ocr/src/provider/mod.rs b/crates/nvisy-ocr/src/provider/mod.rs index dd24c590..bf4b7831 100644 --- a/crates/nvisy-ocr/src/provider/mod.rs +++ b/crates/nvisy-ocr/src/provider/mod.rs @@ -1,11 +1,9 @@ //! All OCR backend implementations and their parameter types. 
mod datalab_surya; -mod mindee_doctr; mod paddle_paddlex; pub use datalab_surya::{SuryaBackend, SuryaParams}; -pub use mindee_doctr::{DoctrBackend, DoctrParams}; pub use paddle_paddlex::{PaddleXBackend, PaddleXParams}; #[cfg(feature = "aws-textract")] From 43c9dd38fbb7c4ff7d8042e22b3513a6262dce41 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 8 Mar 2026 08:02:36 +0100 Subject: [PATCH 4/8] refactor(core): rename BoundingBoxU32 to BoundingBoxPixel Also renames to_u32() to to_pixel() for consistency. Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-codec/src/transform/image/ops.rs | 14 +++++++------- .../src/transform/image/transform.rs | 2 +- crates/nvisy-core/src/math/bounding_box.rs | 18 +++++++++--------- crates/nvisy-core/src/math/mod.rs | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/crates/nvisy-codec/src/transform/image/ops.rs b/crates/nvisy-codec/src/transform/image/ops.rs index 47547213..89dbb22e 100644 --- a/crates/nvisy-codec/src/transform/image/ops.rs +++ b/crates/nvisy-codec/src/transform/image/ops.rs @@ -6,22 +6,22 @@ use image::DynamicImage; use image::imageops::FilterType; use imageproc::filter::gaussian_blur_f32; -use nvisy_core::math::BoundingBoxU32; +use nvisy_core::math::BoundingBoxPixel; /// Mutating image-transform operations on individual bounding-box regions. pub trait ImageOps { /// Apply a gaussian blur to `region` with the given `sigma`. - fn apply_gaussian_blur(&mut self, region: &BoundingBoxU32, sigma: f32); + fn apply_gaussian_blur(&mut self, region: &BoundingBoxPixel, sigma: f32); /// Fill `region` with a solid RGBA `color`. - fn apply_block_overlay(&mut self, region: &BoundingBoxU32, color: [u8; 4]); + fn apply_block_overlay(&mut self, region: &BoundingBoxPixel, color: [u8; 4]); /// Pixelate `region` with the given `block_size`. 
- fn apply_pixelate(&mut self, region: &BoundingBoxU32, block_size: u32); + fn apply_pixelate(&mut self, region: &BoundingBoxPixel, block_size: u32); } impl ImageOps for DynamicImage { - fn apply_gaussian_blur(&mut self, region: &BoundingBoxU32, sigma: f32) { + fn apply_gaussian_blur(&mut self, region: &BoundingBoxPixel, sigma: f32) { let (x, y, w, h) = (region.x, region.y, region.width, region.height); let img_w = self.width(); @@ -40,7 +40,7 @@ impl ImageOps for DynamicImage { image::imageops::overlay(self, &blurred, x as i64, y as i64); } - fn apply_block_overlay(&mut self, region: &BoundingBoxU32, color: [u8; 4]) { + fn apply_block_overlay(&mut self, region: &BoundingBoxPixel, color: [u8; 4]) { let (x, y, w, h) = (region.x, region.y, region.width, region.height); let img_w = self.width(); @@ -55,7 +55,7 @@ impl ImageOps for DynamicImage { image::imageops::overlay(self, &block, x as i64, y as i64); } - fn apply_pixelate(&mut self, region: &BoundingBoxU32, block_size: u32) { + fn apply_pixelate(&mut self, region: &BoundingBoxPixel, block_size: u32) { let block_size = block_size.max(1); let (x, y, w, h) = (region.x, region.y, region.width, region.height); diff --git a/crates/nvisy-codec/src/transform/image/transform.rs b/crates/nvisy-codec/src/transform/image/transform.rs index 8b745c11..e7f2c160 100644 --- a/crates/nvisy-codec/src/transform/image/transform.rs +++ b/crates/nvisy-codec/src/transform/image/transform.rs @@ -51,7 +51,7 @@ where let mut img: DynamicImage = image_data.into_inner(); for redaction in redactions { - let region = redaction.bounding_box.to_u32(); + let region = redaction.bounding_box.to_pixel(); match &redaction.output { ImageOutput::Blur { sigma } => { img.apply_gaussian_blur(®ion, *sigma); diff --git a/crates/nvisy-core/src/math/bounding_box.rs b/crates/nvisy-core/src/math/bounding_box.rs index 722f96da..70ccffed 100644 --- a/crates/nvisy-core/src/math/bounding_box.rs +++ b/crates/nvisy-core/src/math/bounding_box.rs @@ -6,7 +6,7 @@ use 
serde::{Deserialize, Serialize}; /// Axis-aligned bounding box for image-based entity locations. /// /// Coordinates are `f64` to support both pixel and normalized (0.0–1.0) -/// values from detection models. Use [`BoundingBoxU32`] (or [`Into`]) +/// values from detection models. Use [`BoundingBoxPixel`] (or [`Into`]) /// when integer pixel coordinates are needed for rendering. #[derive(Debug, Clone, Copy, Default, PartialEq)] #[derive(Serialize, Deserialize, JsonSchema)] @@ -111,8 +111,8 @@ impl BoundingBox { } /// Convert to integer pixel coordinates by rounding each field. - pub fn to_u32(&self) -> BoundingBoxU32 { - BoundingBoxU32 { + pub fn to_pixel(&self) -> BoundingBoxPixel { + BoundingBoxPixel { x: self.x.round() as u32, y: self.y.round() as u32, width: self.width.round() as u32, @@ -127,7 +127,7 @@ impl BoundingBox { /// integer. Use this at the rendering boundary where pixel-exact /// coordinates are required. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct BoundingBoxU32 { +pub struct BoundingBoxPixel { /// Horizontal offset of the top-left corner in pixels. pub x: u32, /// Vertical offset of the top-left corner in pixels. 
@@ -138,13 +138,13 @@ pub struct BoundingBoxPixel { pub height: u32, } -impl From<&BoundingBox> for BoundingBoxU32 { +impl From<&BoundingBox> for BoundingBoxPixel { fn from(bb: &BoundingBox) -> Self { - bb.to_u32() + bb.to_pixel() } } -impl From<BoundingBox> for BoundingBoxU32 { +impl From<BoundingBox> for BoundingBoxPixel { fn from(bb: BoundingBox) -> Self { Self::from(&bb) } } @@ -221,9 +221,9 @@ mod tests { } #[test] - fn to_u32_rounds() { + fn to_pixel_rounds() { let bb = BoundingBox::new(1.4, 2.6, 3.5, 4.4); - let u = bb.to_u32(); + let u = bb.to_pixel(); assert_eq!(u.x, 1); assert_eq!(u.y, 3); assert_eq!(u.width, 4); diff --git a/crates/nvisy-core/src/math/mod.rs b/crates/nvisy-core/src/math/mod.rs index 0bd098be..0c677122 100644 --- a/crates/nvisy-core/src/math/mod.rs +++ b/crates/nvisy-core/src/math/mod.rs @@ -8,7 +8,7 @@ mod dpi; mod polygon; mod time_span; -pub use bounding_box::{BoundingBox, BoundingBoxU32}; +pub use bounding_box::{BoundingBox, BoundingBoxPixel}; pub use dpi::Dpi; pub use polygon::{Polygon, Vertex}; pub use time_span::TimeSpan; From 9776ee728cadd753210e9d4d392abfc3cf5d8faa Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 8 Mar 2026 16:05:16 +0100 Subject: [PATCH 5/8] feat(ocr): parse page number and image bbox from Surya response Use the upstream `page` and `image_bbox` fields instead of deriving page number from enumeration index and leaving dimensions empty. Co-Authored-By: Claude Opus 4.6 --- .../src/provider/datalab_surya/backend.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs b/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs index 497cf94e..af10c159 100644 --- a/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs +++ b/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs @@ -49,6 +49,10 @@ struct SuryaResponse { #[derive(Debug, Deserialize)] struct SuryaPage { + /// Upstream page number (0-based). 
+ page: u32, + /// Document image bounds `[x_min, y_min, x_max, y_max]`. + image_bbox: [f64; 4], text_lines: Vec, } @@ -94,7 +98,7 @@ impl Backend for SuryaBackend { let threshold = params.confidence_threshold; let mut output = ImageOutput::new(image.source.derive()); - for (page_idx, surya_page) in parsed.pages.iter().enumerate() { + for surya_page in &parsed.pages { let mut lines = Vec::new(); for text_line in &surya_page.text_lines { @@ -159,10 +163,12 @@ impl Backend for SuryaBackend { let block_bbox = BoundingBox::enclosing(lines.iter().map(|l| &l.bbox)); + let [_x_min, _y_min, x_max, y_max] = surya_page.image_bbox; + output.pages.push(Page { - page_number: (page_idx + 1) as u32, - width: None, - height: None, + page_number: surya_page.page + 1, + width: Some(x_max), + height: Some(y_max), blocks: vec![Block { text: block_text, confidence: None, @@ -186,6 +192,8 @@ mod tests { fn parse_response() { let json = serde_json::json!({ "pages": [{ + "page": 0, + "image_bbox": [0.0, 0.0, 800.0, 600.0], "text_lines": [{ "words": [ { From 8a9c941da0a1d62ddd14fb343ed77fad3f8aec13 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 9 Mar 2026 06:11:25 +0100 Subject: [PATCH 6/8] refactor(core): split DocumentType into typed sub-enums Replace the flat DocumentType enum with nested format-specific enums: ImageFormat, WordFormat, PresentationFormat, SpreadsheetFormat, AudioFormat, and TextFormat. Pdf and Html remain standalone variants. Each sub-enum owns its own from_mime/mime_type methods, keeping DocumentType::from_mime as a concise chain of delegates. Remove Archive from ContentKind. Unify nvisy-ocr's ImageFormat with the new core ImageFormat. 
Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 1 - crates/nvisy-codec/src/document/any.rs | 8 +- crates/nvisy-codec/src/document/mod.rs | 5 +- .../src/handler/audio/audio_handler.rs | 10 +- .../src/handler/audio/mp3_handler.rs | 4 +- .../src/handler/audio/wav_handler.rs | 4 +- .../src/handler/image/image_handler.rs | 10 +- .../src/handler/image/jpeg_handler.rs | 2 +- .../src/handler/image/png_handler.rs | 2 +- .../src/handler/rich/docx_handler.rs | 4 +- .../src/handler/text/csv_handler.rs | 4 +- .../src/handler/text/csv_loader.rs | 4 +- .../src/handler/text/json_handler.rs | 4 +- .../src/handler/text/json_loader.rs | 4 +- .../src/handler/text/text_handler.rs | 5 +- .../src/handler/text/txt_handler.rs | 4 +- .../src/handler/text/txt_loader.rs | 4 +- .../src/handler/text/xlsx_handler.rs | 4 +- crates/nvisy-core/src/fs/content_kind.rs | 12 - crates/nvisy-core/src/fs/document_type.rs | 361 ++++++++++++------ crates/nvisy-core/src/fs/mod.rs | 5 +- crates/nvisy-ocr/Cargo.toml | 3 - crates/nvisy-ocr/src/backend/input.rs | 32 +- crates/nvisy-ocr/src/backend/mod.rs | 3 +- 24 files changed, 311 insertions(+), 188 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index da0dafc0..cbbbb4d8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2965,7 +2965,6 @@ dependencies = [ "serde", "serde_json", "sha2", - "strum", "tokio", "tracing", ] diff --git a/crates/nvisy-codec/src/document/any.rs b/crates/nvisy-codec/src/document/any.rs index 521e3ed6..47130b6f 100644 --- a/crates/nvisy-codec/src/document/any.rs +++ b/crates/nvisy-codec/src/document/any.rs @@ -170,6 +170,8 @@ impl From> for AnyDocument { #[cfg(test)] mod tests { + use nvisy_core::fs::{AudioFormat, ImageFormat, TextFormat}; + use super::*; #[test] @@ -177,7 +179,7 @@ mod tests { let doc = Document::new(TxtHandler::new(vec!["hello".into()], false)); let any: AnyDocument = doc.into(); assert!(any.as_text().is_some()); - assert_eq!(any.document_type(), DocumentType::Txt); + assert_eq!(any.document_type(), 
DocumentType::Text(TextFormat::Txt)); } #[test] @@ -186,7 +188,7 @@ mod tests { let doc = Document::new(PngHandler::new(img)); let any: AnyDocument = doc.into(); assert!(any.as_image().is_some()); - assert_eq!(any.document_type(), DocumentType::Png); + assert_eq!(any.document_type(), DocumentType::Image(ImageFormat::Png)); } #[test] @@ -194,7 +196,7 @@ mod tests { let doc = Document::new(WavHandler::new(bytes::Bytes::from_static(b"wav"))); let any: AnyDocument = doc.into(); assert!(any.as_audio().is_some()); - assert_eq!(any.document_type(), DocumentType::Wav); + assert_eq!(any.document_type(), DocumentType::Audio(AudioFormat::Wav)); } #[test] diff --git a/crates/nvisy-codec/src/document/mod.rs b/crates/nvisy-codec/src/document/mod.rs index 2ec54b8e..44f86315 100644 --- a/crates/nvisy-codec/src/document/mod.rs +++ b/crates/nvisy-codec/src/document/mod.rs @@ -178,7 +178,10 @@ mod tests { fn document_type_delegates() { let handler = TxtHandler::new(vec![], false); let doc = Document::new(handler); - assert_eq!(doc.document_type(), DocumentType::Txt); + assert_eq!( + doc.document_type(), + DocumentType::Text(nvisy_core::fs::TextFormat::Txt), + ); } #[test] diff --git a/crates/nvisy-codec/src/handler/audio/audio_handler.rs b/crates/nvisy-codec/src/handler/audio/audio_handler.rs index b7a4e118..e6f3a6ad 100644 --- a/crates/nvisy-codec/src/handler/audio/audio_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/audio_handler.rs @@ -104,7 +104,10 @@ mod tests { #[tokio::test] async fn wav_variant_delegates() { let h = AnyAudio::Wav(WavHandler::new(bytes::Bytes::from_static(b"wav-data"))); - assert_eq!(h.document_type(), DocumentType::Wav); + assert_eq!( + h.document_type(), + DocumentType::Audio(nvisy_core::fs::AudioFormat::Wav), + ); let spans: Vec<_> = h.audio_spans().await.collect().await; assert_eq!(spans.len(), 1); assert_eq!(spans[0].data.as_bytes().as_ref(), b"wav-data"); @@ -113,7 +116,10 @@ mod tests { #[tokio::test] async fn mp3_variant_delegates() { let h = 
AnyAudio::Mp3(Mp3Handler::new(bytes::Bytes::from_static(b"mp3-data"))); - assert_eq!(h.document_type(), DocumentType::Mp3); + assert_eq!( + h.document_type(), + DocumentType::Audio(nvisy_core::fs::AudioFormat::Mp3), + ); assert_eq!(h.encode().unwrap().as_bytes(), b"mp3-data"); } diff --git a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs index 623e0799..a894880c 100644 --- a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs @@ -10,7 +10,7 @@ use bytes::Bytes; use futures::StreamExt; use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{AudioFormat, DocumentType}; use nvisy_core::io::ContentData; use nvisy_core::path::ContentSource; @@ -45,7 +45,7 @@ impl Mp3Handler { impl Handler for Mp3Handler { fn document_type(&self) -> DocumentType { - DocumentType::Mp3 + DocumentType::Audio(AudioFormat::Mp3) } #[tracing::instrument(name = "mp3.encode", skip_all, fields(output_bytes))] diff --git a/crates/nvisy-codec/src/handler/audio/wav_handler.rs b/crates/nvisy-codec/src/handler/audio/wav_handler.rs index 5d769bc4..9f689e80 100644 --- a/crates/nvisy-codec/src/handler/audio/wav_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/wav_handler.rs @@ -10,7 +10,7 @@ use bytes::Bytes; use futures::StreamExt; use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{AudioFormat, DocumentType}; use nvisy_core::io::ContentData; use nvisy_core::path::ContentSource; @@ -45,7 +45,7 @@ impl WavHandler { impl Handler for WavHandler { fn document_type(&self) -> DocumentType { - DocumentType::Wav + DocumentType::Audio(AudioFormat::Wav) } #[tracing::instrument(name = "wav.encode", skip_all, fields(output_bytes))] diff --git a/crates/nvisy-codec/src/handler/image/image_handler.rs b/crates/nvisy-codec/src/handler/image/image_handler.rs index 97fd54b1..b1089420 100644 --- a/crates/nvisy-codec/src/handler/image/image_handler.rs 
+++ b/crates/nvisy-codec/src/handler/image/image_handler.rs @@ -113,13 +113,19 @@ mod tests { #[test] fn png_variant_document_type() { let h = AnyImage::Png(make_png()); - assert_eq!(h.document_type(), DocumentType::Png); + assert_eq!( + h.document_type(), + DocumentType::Image(nvisy_core::fs::ImageFormat::Png), + ); } #[test] fn jpeg_variant_document_type() { let h = AnyImage::Jpeg(make_jpeg()); - assert_eq!(h.document_type(), DocumentType::Jpeg); + assert_eq!( + h.document_type(), + DocumentType::Image(nvisy_core::fs::ImageFormat::Jpeg), + ); } #[tokio::test] diff --git a/crates/nvisy-codec/src/handler/image/jpeg_handler.rs b/crates/nvisy-codec/src/handler/image/jpeg_handler.rs index aad853f5..07067513 100644 --- a/crates/nvisy-codec/src/handler/image/jpeg_handler.rs +++ b/crates/nvisy-codec/src/handler/image/jpeg_handler.rs @@ -22,7 +22,7 @@ pub struct JpegHandler { impl_image_handler!( JpegHandler, - nvisy_core::fs::DocumentType::Jpeg, + nvisy_core::fs::DocumentType::Image(nvisy_core::fs::ImageFormat::Jpeg), image::ImageFormat::Jpeg, "jpeg-handler", "jpeg.encode" diff --git a/crates/nvisy-codec/src/handler/image/png_handler.rs b/crates/nvisy-codec/src/handler/image/png_handler.rs index c086f85b..922e5aed 100644 --- a/crates/nvisy-codec/src/handler/image/png_handler.rs +++ b/crates/nvisy-codec/src/handler/image/png_handler.rs @@ -22,7 +22,7 @@ pub struct PngHandler { impl_image_handler!( PngHandler, - nvisy_core::fs::DocumentType::Png, + nvisy_core::fs::DocumentType::Image(nvisy_core::fs::ImageFormat::Png), image::ImageFormat::Png, "png-handler", "png.encode" diff --git a/crates/nvisy-codec/src/handler/rich/docx_handler.rs b/crates/nvisy-codec/src/handler/rich/docx_handler.rs index d3e1f45a..f317be7b 100644 --- a/crates/nvisy-codec/src/handler/rich/docx_handler.rs +++ b/crates/nvisy-codec/src/handler/rich/docx_handler.rs @@ -1,7 +1,7 @@ //! DOCX handler (stub: awaiting migration to full Loader/Handler pattern). 
use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{DocumentType, WordFormat}; use nvisy_core::io::ContentData; use crate::document::{SpanEditStream, SpanStream}; @@ -14,7 +14,7 @@ pub struct DocxHandler; impl Handler for DocxHandler { fn document_type(&self) -> DocumentType { - DocumentType::Docx + DocumentType::Word(WordFormat::Docx) } #[tracing::instrument(name = "docx.encode", skip_all)] diff --git a/crates/nvisy-codec/src/handler/text/csv_handler.rs b/crates/nvisy-codec/src/handler/text/csv_handler.rs index e13c3123..f1c5f308 100644 --- a/crates/nvisy-codec/src/handler/text/csv_handler.rs +++ b/crates/nvisy-codec/src/handler/text/csv_handler.rs @@ -17,7 +17,7 @@ use futures::StreamExt; use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{DocumentType, SpreadsheetFormat}; use nvisy_core::io::ContentData; use nvisy_core::path::ContentSource; @@ -84,7 +84,7 @@ pub struct CsvHandler { impl Handler for CsvHandler { fn document_type(&self) -> DocumentType { - DocumentType::Csv + DocumentType::Spreadsheet(SpreadsheetFormat::Csv) } #[tracing::instrument(name = "csv.encode", skip_all, fields(output_bytes))] diff --git a/crates/nvisy-codec/src/handler/text/csv_loader.rs b/crates/nvisy-codec/src/handler/text/csv_loader.rs index af520dbf..a3d62a44 100644 --- a/crates/nvisy-codec/src/handler/text/csv_loader.rs +++ b/crates/nvisy-codec/src/handler/text/csv_loader.rs @@ -137,7 +137,7 @@ mod tests { use bytes::Bytes; use futures::StreamExt; use nvisy_core::Error; - use nvisy_core::fs::DocumentType; + use nvisy_core::fs::{DocumentType, SpreadsheetFormat}; use nvisy_core::path::ContentSource; use super::*; @@ -151,7 +151,7 @@ mod tests { let content = content_from_str("name,age\nAlice,30\nBob,25\n"); let doc = CsvLoader.decode(&content, &CsvParams::default()).await?; - assert_eq!(doc.document_type(), DocumentType::Csv); + assert_eq!(doc.document_type(), DocumentType::Spreadsheet(SpreadsheetFormat::Csv)); assert_eq!( 
doc.headers(), Some(["name", "age"].map(String::from).as_slice()) diff --git a/crates/nvisy-codec/src/handler/text/json_handler.rs b/crates/nvisy-codec/src/handler/text/json_handler.rs index 31738f03..c76c069c 100644 --- a/crates/nvisy-codec/src/handler/text/json_handler.rs +++ b/crates/nvisy-codec/src/handler/text/json_handler.rs @@ -23,7 +23,7 @@ use std::num::NonZeroU32; use futures::StreamExt; use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{DocumentType, TextFormat}; use nvisy_core::io::ContentData; use nvisy_core::path::ContentSource; use serde::{Deserialize, Serialize}; @@ -127,7 +127,7 @@ pub struct JsonHandler { impl Handler for JsonHandler { fn document_type(&self) -> DocumentType { - DocumentType::Json + DocumentType::Text(TextFormat::Json) } #[tracing::instrument(name = "json.encode", skip_all, fields(output_bytes))] diff --git a/crates/nvisy-codec/src/handler/text/json_loader.rs b/crates/nvisy-codec/src/handler/text/json_loader.rs index a69de583..a0279bbd 100644 --- a/crates/nvisy-codec/src/handler/text/json_loader.rs +++ b/crates/nvisy-codec/src/handler/text/json_loader.rs @@ -94,7 +94,7 @@ fn detect_formatting(source: &str) -> (JsonIndent, bool) { mod tests { use bytes::Bytes; use nvisy_core::Error; - use nvisy_core::fs::DocumentType; + use nvisy_core::fs::{DocumentType, TextFormat}; use nvisy_core::path::ContentSource; use serde_json::json; @@ -109,7 +109,7 @@ mod tests { let content = content_from_str(r#"{"name": "Alice", "age": 30}"#); let doc = JsonLoader.decode(&content, &JsonParams::default()).await?; - assert_eq!(doc.document_type(), DocumentType::Json); + assert_eq!(doc.document_type(), DocumentType::Text(TextFormat::Json)); assert_eq!(doc.value(), &json!({"name": "Alice", "age": 30})); Ok(()) } diff --git a/crates/nvisy-codec/src/handler/text/text_handler.rs b/crates/nvisy-codec/src/handler/text/text_handler.rs index de1ac44a..621165e9 100644 --- a/crates/nvisy-codec/src/handler/text/text_handler.rs +++ 
b/crates/nvisy-codec/src/handler/text/text_handler.rs @@ -222,7 +222,10 @@ mod tests { #[test] fn txt_variant_document_type() { let h = AnyText::Txt(TxtHandler::new(vec!["hello".into()], false)); - assert_eq!(h.document_type(), DocumentType::Txt); + assert_eq!( + h.document_type(), + DocumentType::Text(nvisy_core::fs::TextFormat::Txt), + ); } #[tokio::test] diff --git a/crates/nvisy-codec/src/handler/text/txt_handler.rs b/crates/nvisy-codec/src/handler/text/txt_handler.rs index a0c09754..52615f03 100644 --- a/crates/nvisy-codec/src/handler/text/txt_handler.rs +++ b/crates/nvisy-codec/src/handler/text/txt_handler.rs @@ -16,7 +16,7 @@ use futures::StreamExt; use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{DocumentType, TextFormat}; use nvisy_core::io::ContentData; use nvisy_core::path::ContentSource; @@ -40,7 +40,7 @@ pub struct TxtHandler { impl Handler for TxtHandler { fn document_type(&self) -> DocumentType { - DocumentType::Txt + DocumentType::Text(TextFormat::Txt) } #[tracing::instrument(name = "txt.encode", skip_all, fields(output_bytes))] diff --git a/crates/nvisy-codec/src/handler/text/txt_loader.rs b/crates/nvisy-codec/src/handler/text/txt_loader.rs index b937517a..5840c977 100644 --- a/crates/nvisy-codec/src/handler/text/txt_loader.rs +++ b/crates/nvisy-codec/src/handler/text/txt_loader.rs @@ -53,7 +53,7 @@ mod tests { use bytes::Bytes; use futures::StreamExt; use nvisy_core::Error; - use nvisy_core::fs::DocumentType; + use nvisy_core::fs::{DocumentType, TextFormat}; use nvisy_core::path::ContentSource; use super::*; @@ -67,7 +67,7 @@ mod tests { let content = content_from_str("hello\nworld\n"); let doc = TxtLoader.decode(&content, &TxtParams::default()).await?; - assert_eq!(doc.document_type(), DocumentType::Txt); + assert_eq!(doc.document_type(), DocumentType::Text(TextFormat::Txt)); assert_eq!(doc.lines(), &["hello", "world"]); assert!(doc.trailing_newline()); Ok(()) diff --git 
a/crates/nvisy-codec/src/handler/text/xlsx_handler.rs b/crates/nvisy-codec/src/handler/text/xlsx_handler.rs index 9e1615d8..2bdcd202 100644 --- a/crates/nvisy-codec/src/handler/text/xlsx_handler.rs +++ b/crates/nvisy-codec/src/handler/text/xlsx_handler.rs @@ -1,7 +1,7 @@ //! XLSX handler (stub: awaiting full spreadsheet support). use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{DocumentType, SpreadsheetFormat}; use nvisy_core::io::ContentData; use crate::document::{SpanEditStream, SpanStream}; @@ -13,7 +13,7 @@ pub struct XlsxHandler; impl Handler for XlsxHandler { fn document_type(&self) -> DocumentType { - DocumentType::Xlsx + DocumentType::Spreadsheet(SpreadsheetFormat::Xlsx) } #[tracing::instrument(name = "xlsx.encode", skip_all)] diff --git a/crates/nvisy-core/src/fs/content_kind.rs b/crates/nvisy-core/src/fs/content_kind.rs index ce35cf75..7f0515e3 100644 --- a/crates/nvisy-core/src/fs/content_kind.rs +++ b/crates/nvisy-core/src/fs/content_kind.rs @@ -38,8 +38,6 @@ pub enum ContentKind { Spreadsheet, /// Image files Image, - /// Archive files (ZIP, TAR, etc.) 
- Archive, /// Unknown or unsupported content type #[default] Unknown, @@ -69,12 +67,6 @@ impl ContentKind { pub fn is_image(&self) -> bool { matches!(self, Self::Image) } - - /// Check if this content kind represents an archive - #[must_use] - pub fn is_archive(&self) -> bool { - matches!(self, Self::Archive) - } } #[cfg(test)] @@ -94,9 +86,6 @@ mod tests { assert!(ContentKind::Image.is_image()); assert!(!ContentKind::Text.is_image()); - - assert!(ContentKind::Archive.is_archive()); - assert!(!ContentKind::Document.is_archive()); } #[test] @@ -105,7 +94,6 @@ mod tests { assert_eq!(ContentKind::Document.to_string(), "document"); assert_eq!(ContentKind::Spreadsheet.to_string(), "spreadsheet"); assert_eq!(ContentKind::Image.to_string(), "image"); - assert_eq!(ContentKind::Archive.to_string(), "archive"); assert_eq!(ContentKind::Unknown.to_string(), "unknown"); } diff --git a/crates/nvisy-core/src/fs/document_type.rs b/crates/nvisy-core/src/fs/document_type.rs index 1b7ce30c..8ef7c7ac 100644 --- a/crates/nvisy-core/src/fs/document_type.rs +++ b/crates/nvisy-core/src/fs/document_type.rs @@ -1,143 +1,288 @@ //! Document format classification. +use std::fmt; + use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use strum::{Display, EnumString}; +use strum::{Display, EnumString, IntoStaticStr}; -/// Document format that content can be classified as. -#[derive( - Debug, - Clone, - Copy, - PartialEq, - Eq, - Hash, - Display, - EnumString, - Serialize, - Deserialize, - JsonSchema -)] +/// Image file format. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] -pub enum DocumentType { - /// Plain text (`.txt`, `.log`, etc.). - Txt, - /// Comma-separated values. - Csv, - /// JSON data. - Json, - /// HTML pages. - Html, - /// PDF documents. - Pdf, - /// Microsoft Word (`.docx`). 
- Docx, - /// Microsoft Excel (`.xlsx`). - Xlsx, - /// PNG image. +pub enum ImageFormat { Png, - /// JPEG image. Jpeg, - /// WAV audio. + Webp, + Gif, + Tiff, +} + +impl ImageFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Png => "image/png", + Self::Jpeg => "image/jpeg", + Self::Webp => "image/webp", + Self::Gif => "image/gif", + Self::Tiff => "image/tiff", + } + } + + /// File extension (without leading dot). + pub fn extension(self) -> &'static str { + match self { + Self::Png => "png", + Self::Jpeg => "jpeg", + Self::Webp => "webp", + Self::Gif => "gif", + Self::Tiff => "tiff", + } + } + + /// Parse from a MIME type string. + pub fn from_mime(mime: &str) -> Option { + match mime { + "image/png" => Some(Self::Png), + "image/jpeg" => Some(Self::Jpeg), + "image/webp" => Some(Self::Webp), + "image/gif" => Some(Self::Gif), + "image/tiff" => Some(Self::Tiff), + _ => None, + } + } +} + +/// Word-processor document format. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum WordFormat { + Doc, + Docx, + Odt, +} + +impl WordFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Doc => "application/msword", + Self::Docx => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + Self::Odt => "application/vnd.oasis.opendocument.text", + } + } + + /// Parse from a MIME type string. + pub fn from_mime(mime: &str) -> Option { + match mime { + "application/msword" => Some(Self::Doc), + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => Some(Self::Docx), + "application/vnd.oasis.opendocument.text" => Some(Self::Odt), + _ => None, + } + } +} + +/// Presentation document format. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum PresentationFormat { + Ppt, + Pptx, + Odp, +} + +impl PresentationFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Ppt => "application/vnd.ms-powerpoint", + Self::Pptx => "application/vnd.openxmlformats-officedocument.presentationml.presentation", + Self::Odp => "application/vnd.oasis.opendocument.presentation", + } + } + + /// Parse from a MIME type string. + pub fn from_mime(mime: &str) -> Option { + match mime { + "application/vnd.ms-powerpoint" => Some(Self::Ppt), + "application/vnd.openxmlformats-officedocument.presentationml.presentation" => Some(Self::Pptx), + "application/vnd.oasis.opendocument.presentation" => Some(Self::Odp), + _ => None, + } + } +} + +/// Spreadsheet document format. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum SpreadsheetFormat { + Xls, + Xlsx, + Xlsm, + Xltx, + Csv, + Ods, +} + +impl SpreadsheetFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Xls => "application/vnd.ms-excel", + Self::Xlsx => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + Self::Xlsm => "application/vnd.ms-excel.sheet.macroEnabled.12", + Self::Xltx => "application/vnd.openxmlformats-officedocument.spreadsheetml.template", + Self::Csv => "text/csv", + Self::Ods => "application/vnd.oasis.opendocument.spreadsheet", + } + } + + /// Parse from a MIME type string. 
+ pub fn from_mime(mime: &str) -> Option { + match mime { + "application/vnd.ms-excel" => Some(Self::Xls), + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => Some(Self::Xlsx), + "application/vnd.ms-excel.sheet.macroEnabled.12" => Some(Self::Xlsm), + "application/vnd.openxmlformats-officedocument.spreadsheetml.template" => Some(Self::Xltx), + "text/csv" => Some(Self::Csv), + "application/vnd.oasis.opendocument.spreadsheet" => Some(Self::Ods), + _ => None, + } + } +} + +/// Audio file format. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum AudioFormat { Wav, - /// MP3 audio. Mp3, } -impl DocumentType { - /// Map a MIME type string to a [`DocumentType`]. - /// - /// Returns `None` for unrecognised MIME types. +impl AudioFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Wav => "audio/wav", + Self::Mp3 => "audio/mpeg", + } + } + + /// Parse from a MIME type string. + pub fn from_mime(mime: &str) -> Option { + match mime { + "audio/wav" | "audio/x-wav" => Some(Self::Wav), + "audio/mpeg" => Some(Self::Mp3), + _ => None, + } + } +} + +/// Plain text format. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum TextFormat { + Txt, + Log, + Json, +} + +impl TextFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Txt | Self::Log => "text/plain", + Self::Json => "application/json", + } + } + + /// Parse from a MIME type string. 
pub fn from_mime(mime: &str) -> Option { match mime { "text/plain" => Some(Self::Txt), - "text/csv" => Some(Self::Csv), "application/json" => Some(Self::Json), - "text/html" => Some(Self::Html), - "image/png" => Some(Self::Png), - "image/jpeg" => Some(Self::Jpeg), - "audio/x-wav" | "audio/wav" => Some(Self::Wav), - "audio/mpeg" => Some(Self::Mp3), - "application/pdf" => Some(Self::Pdf), - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => { - Some(Self::Docx) - } - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => Some(Self::Xlsx), _ => None, } } } +/// Document format that content can be classified as. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub enum DocumentType { + Text(TextFormat), + Image(ImageFormat), + Word(WordFormat), + Presentation(PresentationFormat), + Spreadsheet(SpreadsheetFormat), + Audio(AudioFormat), + Html, + Pdf, +} + +impl fmt::Display for DocumentType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Text(fmt_) => write!(f, "{fmt_}"), + Self::Image(fmt_) => write!(f, "{fmt_}"), + Self::Word(fmt_) => write!(f, "{fmt_}"), + Self::Presentation(fmt_) => write!(f, "{fmt_}"), + Self::Spreadsheet(fmt_) => write!(f, "{fmt_}"), + Self::Audio(fmt_) => write!(f, "{fmt_}"), + Self::Html => write!(f, "html"), + Self::Pdf => write!(f, "pdf"), + } + } +} + +impl DocumentType { + /// Map a MIME type string to a [`DocumentType`]. + /// + /// Returns `None` for unrecognised MIME types. 
+ pub fn from_mime(mime: &str) -> Option { + None.or_else(|| TextFormat::from_mime(mime).map(Self::Text)) + .or_else(|| ImageFormat::from_mime(mime).map(Self::Image)) + .or_else(|| WordFormat::from_mime(mime).map(Self::Word)) + .or_else(|| PresentationFormat::from_mime(mime).map(Self::Presentation)) + .or_else(|| SpreadsheetFormat::from_mime(mime).map(Self::Spreadsheet)) + .or_else(|| AudioFormat::from_mime(mime).map(Self::Audio)) + .or_else(|| match mime { + "text/html" => Some(Self::Html), + "application/pdf" => Some(Self::Pdf), + _ => None, + }) + } +} + #[cfg(test)] mod tests { use super::*; #[test] - fn test_from_mime_text_types() { - assert_eq!( - DocumentType::from_mime("text/plain"), - Some(DocumentType::Txt) - ); - assert_eq!(DocumentType::from_mime("text/csv"), Some(DocumentType::Csv)); - assert_eq!( - DocumentType::from_mime("text/html"), - Some(DocumentType::Html) - ); - } - - #[test] - fn test_from_mime_application_types() { - assert_eq!( - DocumentType::from_mime("application/json"), - Some(DocumentType::Json) - ); - assert_eq!( - DocumentType::from_mime("application/pdf"), - Some(DocumentType::Pdf) - ); - assert_eq!( - DocumentType::from_mime( - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - ), - Some(DocumentType::Docx) - ); - assert_eq!( - DocumentType::from_mime( - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - ), - Some(DocumentType::Xlsx) - ); + fn from_mime_unknown_returns_none() { + assert_eq!(DocumentType::from_mime("application/octet-stream"), None); + assert_eq!(DocumentType::from_mime("video/mp4"), None); + assert_eq!(DocumentType::from_mime(""), None); } #[test] - fn test_from_mime_media_types() { - assert_eq!( - DocumentType::from_mime("image/png"), - Some(DocumentType::Png) - ); - assert_eq!( - DocumentType::from_mime("image/jpeg"), - Some(DocumentType::Jpeg) - ); - assert_eq!( - DocumentType::from_mime("audio/wav"), - Some(DocumentType::Wav) - ); + fn from_mime_alias() { assert_eq!( 
DocumentType::from_mime("audio/x-wav"), - Some(DocumentType::Wav) - ); - assert_eq!( - DocumentType::from_mime("audio/mpeg"), - Some(DocumentType::Mp3) + Some(DocumentType::Audio(AudioFormat::Wav)), ); } - - #[test] - fn test_from_mime_unknown() { - assert_eq!(DocumentType::from_mime("application/octet-stream"), None); - assert_eq!(DocumentType::from_mime("video/mp4"), None); - assert_eq!(DocumentType::from_mime(""), None); - } } diff --git a/crates/nvisy-core/src/fs/mod.rs b/crates/nvisy-core/src/fs/mod.rs index 5085d438..05cddbb0 100644 --- a/crates/nvisy-core/src/fs/mod.rs +++ b/crates/nvisy-core/src/fs/mod.rs @@ -12,4 +12,7 @@ mod document_type; pub use content_kind::ContentKind; pub use content_metadata::ContentMetadata; -pub use document_type::DocumentType; +pub use document_type::{ + AudioFormat, DocumentType, ImageFormat, PresentationFormat, SpreadsheetFormat, TextFormat, + WordFormat, +}; diff --git a/crates/nvisy-ocr/Cargo.toml b/crates/nvisy-ocr/Cargo.toml index 44926bf8..a3b75994 100644 --- a/crates/nvisy-ocr/Cargo.toml +++ b/crates/nvisy-ocr/Cargo.toml @@ -43,9 +43,6 @@ bytes = { workspace = true, features = [] } sha2 = { workspace = true, optional = true, features = [] } hmac = { workspace = true, optional = true, features = [] } -# Derive macros and error handling -strum = { workspace = true, features = [] } - # (De)serialization serde = { workspace = true, features = [] } serde_json = { workspace = true, features = [] } diff --git a/crates/nvisy-ocr/src/backend/input.rs b/crates/nvisy-ocr/src/backend/input.rs index b794081c..723e21c1 100644 --- a/crates/nvisy-ocr/src/backend/input.rs +++ b/crates/nvisy-ocr/src/backend/input.rs @@ -3,38 +3,8 @@ use base64::Engine as _; use base64::engine::general_purpose::STANDARD as BASE64; use bytes::Bytes; +use nvisy_core::fs::ImageFormat; use nvisy_core::path::ContentSource; -use strum::{Display, EnumString, IntoStaticStr}; - -/// Image format passed to a [`Backend`]. 
-/// -/// [`Backend`]: super::Backend -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[derive(Display, EnumString, IntoStaticStr)] -pub enum ImageFormat { - #[strum(serialize = "png")] - Png, - #[strum(serialize = "jpeg")] - Jpeg, -} - -impl ImageFormat { - /// MIME type string for this format. - pub fn mime_type(self) -> &'static str { - match self { - Self::Png => "image/png", - Self::Jpeg => "image/jpeg", - } - } - - /// File extension for this format (without leading dot). - pub fn extension(self) -> &'static str { - match self { - Self::Png => "png", - Self::Jpeg => "jpeg", - } - } -} /// Image payload passed to [`Backend::run`]. /// diff --git a/crates/nvisy-ocr/src/backend/mod.rs b/crates/nvisy-ocr/src/backend/mod.rs index 936668ad..00fa9518 100644 --- a/crates/nvisy-ocr/src/backend/mod.rs +++ b/crates/nvisy-ocr/src/backend/mod.rs @@ -3,7 +3,8 @@ mod input; mod output; -pub use input::{ImageFormat, ImageInput}; +pub use input::ImageInput; +pub use nvisy_core::fs::ImageFormat; use nvisy_core::Error; pub use output::{Block, BlockKind, ImageOutput, Line, Page, Word}; use reqwest_middleware::reqwest::Response; From 0dd104004820205da86f892654460b731a508a38 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 9 Mar 2026 14:54:16 +0100 Subject: [PATCH 7/8] chore(deps): upgrade rig-core from 0.31 to 0.32 Remove turbofish type parameters from Client::builder() calls to match the new builder API where the HTTP client type is set via .http_client() rather than as a generic parameter. 
Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 63 ++++++++++++++----- Cargo.toml | 2 +- .../src/backend/provider/authenticated.rs | 6 +- .../src/backend/provider/unauthenticated.rs | 2 +- 4 files changed, 54 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cbbbb4d8..d2bc2dff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -769,20 +769,21 @@ checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" [[package]] name = "convert_case" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baaaa0ecca5b51987b9423ccdc971514dd8b0bb7b4060b983d3664dad3f1f89f" +checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9" dependencies = [ "unicode-segmentation", ] [[package]] -name = "convert_case" -version = "0.10.0" +name = "core-foundation" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ - "unicode-segmentation", + "core-foundation-sys", + "libc", ] [[package]] @@ -1125,7 +1126,7 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb" dependencies = [ - "convert_case 0.10.0", + "convert_case", "proc-macro2", "quote", "rustc_version", @@ -1902,9 +1903,11 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2", + "system-configuration", "tokio", "tower-service", "tracing", + "windows-registry", ] [[package]] @@ -3979,9 +3982,9 @@ checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce" [[package]] name = "rig-core" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "437fa2a15825caf2505411bbe55b05c8eb122e03934938b38f9ecaa1d6ded7c8" +checksum = 
"24eb001344690ad016a095c6384b09b93ea12551490b4ed1a197058aeac990d6" dependencies = [ "as-any", "async-stream", @@ -4013,11 +4016,11 @@ dependencies = [ [[package]] name = "rig-derive" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f4b48f1449fa214d5cb11d0d0d952fd4c13b7ca5d1eaac64c87ce03cfb9e24" +checksum = "3b6d9818c9cb13d00664b52fd3e47b0554bc2d5c59cfb90340dd9411b09553bc" dependencies = [ - "convert_case 0.8.0", + "convert_case", "deluxe", "indoc", "proc-macro2", @@ -4133,7 +4136,7 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784" dependencies = [ - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "jni", "log", @@ -4279,7 +4282,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d17b898a6d6948c3a8ee4372c17cb384f90d2e6e912ef00895b14fd7ab54ec38" dependencies = [ "bitflags", - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -4709,6 +4712,27 @@ dependencies = [ "syn", ] +[[package]] +name = "system-configuration" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "target-lexicon" version = "0.13.5" @@ -5593,6 +5617,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] 
+name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.4.1" diff --git a/Cargo.toml b/Cargo.toml index 62b89ab7..c6cba8b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,7 @@ nvisy-rig = { path = "./crates/nvisy-rig", version = "0.1.0" } nvisy-server = { path = "./crates/nvisy-server", version = "0.1.0" } # LLM framework -rig-core = { version = "0.31", features = [] } +rig-core = { version = "0.32", features = [] } # HTTP client and middleware reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls", "multipart"] } diff --git a/crates/nvisy-rig/src/backend/provider/authenticated.rs b/crates/nvisy-rig/src/backend/provider/authenticated.rs index 73dcb9c0..67d06aab 100644 --- a/crates/nvisy-rig/src/backend/provider/authenticated.rs +++ b/crates/nvisy-rig/src/backend/provider/authenticated.rs @@ -68,7 +68,7 @@ impl AuthenticatedProvider { &self, http: ClientWithMiddleware, ) -> Result, Error> { - let mut b = openai::Client::::builder() + let mut b = openai::Client::builder() .api_key(&self.api_key) .http_client(http); if let Some(url) = &self.base_url { @@ -84,7 +84,7 @@ impl AuthenticatedProvider { &self, http: ClientWithMiddleware, ) -> Result, Error> { - let mut b = gemini::Client::::builder() + let mut b = gemini::Client::builder() .api_key(&self.api_key) .http_client(http); if let Some(url) = &self.base_url { @@ -100,7 +100,7 @@ impl AuthenticatedProvider { &self, http: ClientWithMiddleware, ) -> Result, Error> { - let mut b = anthropic::Client::::builder() + let mut b = anthropic::Client::builder() .api_key(&self.api_key) .http_client(http); if let Some(url) = &self.base_url { diff --git a/crates/nvisy-rig/src/backend/provider/unauthenticated.rs 
b/crates/nvisy-rig/src/backend/provider/unauthenticated.rs index 13a49ee7..580ea5a2 100644 --- a/crates/nvisy-rig/src/backend/provider/unauthenticated.rs +++ b/crates/nvisy-rig/src/backend/provider/unauthenticated.rs @@ -19,7 +19,7 @@ impl UnauthenticatedProvider { &self, http: ClientWithMiddleware, ) -> Result, Error> { - let mut b = ollama::Client::::builder() + let mut b = ollama::Client::builder() .api_key(rig::client::Nothing) .http_client(http); if let Some(url) = &self.base_url { From 1ff233e088fe08b78dc2df4280a524ea224aeec1 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 9 Mar 2026 15:47:02 +0100 Subject: [PATCH 8/8] fix(core): use or instead of or_else for non-lazy match in from_mime Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-core/src/fs/document_type.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/nvisy-core/src/fs/document_type.rs b/crates/nvisy-core/src/fs/document_type.rs index 8ef7c7ac..f8bd7f5b 100644 --- a/crates/nvisy-core/src/fs/document_type.rs +++ b/crates/nvisy-core/src/fs/document_type.rs @@ -259,7 +259,7 @@ impl DocumentType { .or_else(|| PresentationFormat::from_mime(mime).map(Self::Presentation)) .or_else(|| SpreadsheetFormat::from_mime(mime).map(Self::Spreadsheet)) .or_else(|| AudioFormat::from_mime(mime).map(Self::Audio)) - .or_else(|| match mime { + .or(match mime { "text/html" => Some(Self::Html), "application/pdf" => Some(Self::Pdf), _ => None,