From e6886554e1c0bceb3f0164d88f4537f5c2ba94b1 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 8 Mar 2026 07:51:26 +0100 Subject: [PATCH 1/8] feat(ocr): replace flat ImageRegion model with nested Page/Block/Line/Word tree Preserves hierarchical structure from each provider's API instead of discarding it during conversion. Adds BoundingBox::enclosing() to nvisy-core and updates all six provider backends to build the tree. Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-core/src/math/bounding_box.rs | 10 + crates/nvisy-ocr/src/backend/mod.rs | 8 +- crates/nvisy-ocr/src/backend/output.rs | 192 +++++++------- crates/nvisy-ocr/src/engine/mod.rs | 8 +- crates/nvisy-ocr/src/lib.rs | 2 +- crates/nvisy-ocr/src/prelude.rs | 4 +- .../src/provider/aws_textract/backend.rs | 238 +++++++++++++++--- .../src/provider/azure_docai/backend.rs | 46 +++- .../src/provider/datalab_surya/backend.rs | 63 ++++- .../src/provider/google_vision/backend.rs | 129 +++++++--- .../src/provider/mindee_doctr/backend.rs | 49 +++- .../src/provider/paddle_paddlex/backend.rs | 58 ++++- 12 files changed, 606 insertions(+), 201 deletions(-) diff --git a/crates/nvisy-core/src/math/bounding_box.rs b/crates/nvisy-core/src/math/bounding_box.rs index 7a86d0ad..29ec65a5 100644 --- a/crates/nvisy-core/src/math/bounding_box.rs +++ b/crates/nvisy-core/src/math/bounding_box.rs @@ -108,6 +108,16 @@ impl BoundingBox { if union == 0.0 { 0.0 } else { inter / union } } + /// Returns the smallest box enclosing all boxes in the iterator. + /// + /// Returns [`BoundingBox::default()`] if the iterator is empty. + pub fn enclosing<'a>(mut iter: impl Iterator) -> BoundingBox { + match iter.next() { + None => BoundingBox::default(), + Some(first) => iter.fold(*first, |acc, b| acc.union(b)), + } + } + /// Convert to integer pixel coordinates by rounding each field. 
pub fn to_u32(&self) -> BoundingBoxU32 { BoundingBoxU32 { diff --git a/crates/nvisy-ocr/src/backend/mod.rs b/crates/nvisy-ocr/src/backend/mod.rs index e9558a0f..936668ad 100644 --- a/crates/nvisy-ocr/src/backend/mod.rs +++ b/crates/nvisy-ocr/src/backend/mod.rs @@ -5,7 +5,7 @@ mod output; pub use input::{ImageFormat, ImageInput}; use nvisy_core::Error; -pub use output::{ImageOutput, ImageRegion, TextLevel}; +pub use output::{Block, BlockKind, ImageOutput, Line, Page, Word}; use reqwest_middleware::reqwest::Response; use reqwest_middleware::reqwest::multipart::Part; @@ -61,11 +61,11 @@ impl RunParams { /// Backend trait for OCR providers. /// /// Implementations send image bytes to an OCR service and return -/// typed [`ImageRegion`] results with word-level bounding boxes. +/// hierarchical [`ImageOutput`] results with page/block/line/word structure. /// /// Confidence values **must** be normalised to 0.0..=1.0 before -/// populating [`ImageRegion::confidence`]. Backends whose upstream -/// API uses a different scale (e.g. AWS Textract returns 0–100) are +/// populating [`Word::confidence`]. Backends whose upstream API uses +/// a different scale (e.g. AWS Textract returns 0–100) are /// responsible for converting. #[async_trait::async_trait] pub trait Backend: Send + Sync + 'static { diff --git a/crates/nvisy-ocr/src/backend/output.rs b/crates/nvisy-ocr/src/backend/output.rs index 373d3dbf..45edaaa4 100644 --- a/crates/nvisy-ocr/src/backend/output.rs +++ b/crates/nvisy-ocr/src/backend/output.rs @@ -3,28 +3,11 @@ use nvisy_core::math::{BoundingBox, Polygon}; use nvisy_core::path::ContentSource; use serde::{Deserialize, Serialize}; -use strum::{Display, EnumString}; -/// Hierarchical level of a text region within a document page. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[derive(Display, EnumString, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -#[strum(serialize_all = "snake_case")] -pub enum TextLevel { - /// Full page. 
- Page, - /// Block-level region (paragraph, table, figure). - Block, - /// Single line of text. - Line, - /// Individual word. - Word, -} - -/// A single text region detected by an OCR backend. +/// A single word detected by OCR. #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ImageRegion { - /// Extracted text content. +pub struct Word { + /// Recognised text content. pub text: String, /// Confidence score (0.0..=1.0), if the backend provides one. pub confidence: Option, @@ -32,36 +15,72 @@ pub struct ImageRegion { pub bbox: BoundingBox, /// Polygon vertices for rotated or skewed text regions. pub polygon: Option, - /// Hierarchical level of this text region: word, line, block, etc. - pub level: Option, } -impl ImageRegion { - /// Returns `true` if the extracted text is empty. - pub fn is_empty(&self) -> bool { - self.text.is_empty() - } +/// A line of text: ordered sequence of words. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Line { + /// Concatenated text from all words in this line. + pub text: String, + /// Line-level confidence, if the provider gives one. + pub confidence: Option, + /// Axis-aligned bounding box enclosing the line. + pub bbox: BoundingBox, + /// Polygon vertices for the line region. + pub polygon: Option, + /// Words in reading order. + pub words: Vec, +} - /// Length of the extracted text in bytes. - pub fn text_len(&self) -> usize { - self.text.len() - } +/// Classification of a block region. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum BlockKind { + /// Paragraph / prose. + Text, + /// Tabular content. + Table, + /// Figure / chart. + Figure, + /// Unclassified. + Other, +} - /// Area of the bounding box: width × height. - pub fn area(&self) -> f64 { - self.bbox.width * self.bbox.height - } +/// A block (paragraph, table cell, figure caption, etc.). 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Block { + /// Concatenated text from all lines in this block. + pub text: String, + /// Block-level confidence, if available. + pub confidence: Option, + /// Axis-aligned bounding box enclosing the block. + pub bbox: BoundingBox, + /// Polygon vertices for the block region. + pub polygon: Option, + /// Classification of this block. + pub kind: BlockKind, + /// Lines in reading order. + pub lines: Vec, +} - /// Returns `true` if the confidence meets or exceeds the given threshold. - pub fn meets_threshold(&self, threshold: f64) -> bool { - self.confidence.unwrap_or(0.0) >= threshold - } +/// A single page of OCR results. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Page { + /// 1-based page number. + pub page_number: u32, + /// Page width in pixels, when known. + pub width: Option, + /// Page height in pixels, when known. + pub height: Option, + /// Blocks in reading order. + pub blocks: Vec, } -/// Output from an OCR run on a single image. +/// Complete OCR output for one image/document. /// -/// Groups detected [`ImageRegion`]s together with a [`ContentSource`] -/// derived from the input image for provenance tracking. +/// Groups detected text into a hierarchical tree of +/// [`Page`] → [`Block`] → [`Line`] → [`Word`], together with a +/// [`ContentSource`] derived from the input image for provenance tracking. /// /// [`ContentSource`]: nvisy_core::path::ContentSource #[derive(Debug, Clone, Serialize, Deserialize)] @@ -70,8 +89,8 @@ pub struct ImageOutput { /// /// [`ContentSource`]: nvisy_core::path::ContentSource pub source: ContentSource, - /// Text regions detected in the image. - pub regions: Vec, + /// Pages of OCR results. + pub pages: Vec, } impl ImageOutput { @@ -79,72 +98,59 @@ impl ImageOutput { pub fn new(source: ContentSource) -> Self { Self { source, - regions: Vec::new(), + pages: Vec::new(), } } - /// Insert a region into this output. 
- pub fn insert(&mut self, region: ImageRegion) { - self.regions.push(region); - } - - /// Number of detected regions. + /// Number of pages. pub fn len(&self) -> usize { - self.regions.len() + self.pages.len() } - /// Returns `true` if no regions were detected. + /// Returns `true` if no pages or no words were detected. pub fn is_empty(&self) -> bool { - self.regions.is_empty() - } - - /// Iterator over the detected regions. - pub fn iter(&self) -> std::slice::Iter<'_, ImageRegion> { - self.regions.iter() - } - - /// Mutable iterator over the detected regions. - pub fn iter_mut(&mut self) -> std::slice::IterMut<'_, ImageRegion> { - self.regions.iter_mut() + self.pages.is_empty() || self.words().next().is_none() } - /// Retain only regions that satisfy the predicate. - pub fn retain(&mut self, f: impl FnMut(&ImageRegion) -> bool) { - self.regions.retain(f); + /// Flat iterator over all words across all pages/blocks/lines. + pub fn words(&self) -> impl Iterator { + self.pages + .iter() + .flat_map(|p| &p.blocks) + .flat_map(|b| &b.lines) + .flat_map(|l| &l.words) } - /// Filter regions that meet the given confidence threshold. - pub fn above_threshold(&self, threshold: f64) -> Vec<&ImageRegion> { - self.regions + /// Flat iterator over all lines. + pub fn lines(&self) -> impl Iterator { + self.pages .iter() - .filter(|r| r.meets_threshold(threshold)) - .collect() + .flat_map(|p| &p.blocks) + .flat_map(|b| &b.lines) } -} -impl<'a> IntoIterator for &'a ImageOutput { - type IntoIter = std::slice::Iter<'a, ImageRegion>; - type Item = &'a ImageRegion; - - fn into_iter(self) -> Self::IntoIter { - self.iter() + /// Flat iterator over all blocks. 
+ pub fn blocks(&self) -> impl Iterator { + self.pages.iter().flat_map(|p| &p.blocks) } -} -impl<'a> IntoIterator for &'a mut ImageOutput { - type IntoIter = std::slice::IterMut<'a, ImageRegion>; - type Item = &'a mut ImageRegion; - - fn into_iter(self) -> Self::IntoIter { - self.iter_mut() + /// Full extracted text (pages joined by `\n\n`). + pub fn full_text(&self) -> String { + self.pages + .iter() + .map(|p| { + p.blocks + .iter() + .map(|b| b.text.as_str()) + .collect::>() + .join("\n") + }) + .collect::>() + .join("\n\n") } -} - -impl IntoIterator for ImageOutput { - type IntoIter = std::vec::IntoIter; - type Item = ImageRegion; - fn into_iter(self) -> Self::IntoIter { - self.regions.into_iter() + /// Total word count across all pages. + pub fn word_count(&self) -> usize { + self.words().count() } } diff --git a/crates/nvisy-ocr/src/engine/mod.rs b/crates/nvisy-ocr/src/engine/mod.rs index 21547b19..45dda24e 100644 --- a/crates/nvisy-ocr/src/engine/mod.rs +++ b/crates/nvisy-ocr/src/engine/mod.rs @@ -28,7 +28,7 @@ use crate::backend::{Backend, ImageInput, ImageOutput, RunParams}; /// /// let image = ImageInput::new(png_bytes, ImageFormat::Png); /// let output = engine.run(&image, &RunParams::default()).await?; -/// println!("{} regions detected", output.len()); +/// println!("{} words detected", output.word_count()); /// ``` #[derive(Clone)] pub struct OcrEngine { @@ -57,7 +57,7 @@ impl OcrEngine { ))] pub async fn run(&self, image: &ImageInput, params: &RunParams) -> Result { let output = self.backend.run(image, params).await?; - tracing::debug!(regions = output.len(), "ocr complete"); + tracing::debug!(words = output.word_count(), "ocr complete"); Ok(output) } @@ -69,8 +69,8 @@ impl OcrEngine { params: &RunParams, ) -> Result, Error> { let outputs = self.backend.run_batch(images, params).await?; - let regions: usize = outputs.iter().map(|o| o.len()).sum(); - tracing::debug!(regions, "batch ocr complete"); + let words: usize = outputs.iter().map(|o| 
o.word_count()).sum(); + tracing::debug!(words, "batch ocr complete"); Ok(outputs) } } diff --git a/crates/nvisy-ocr/src/lib.rs b/crates/nvisy-ocr/src/lib.rs index 376c21da..ae0dfbaf 100644 --- a/crates/nvisy-ocr/src/lib.rs +++ b/crates/nvisy-ocr/src/lib.rs @@ -10,6 +10,6 @@ pub mod provider; pub mod prelude; pub use backend::{ - Backend, ImageFormat, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, + Backend, Block, BlockKind, ImageFormat, ImageInput, ImageOutput, Line, Page, RunParams, Word, }; pub use engine::{OcrEngine, OcrProvider}; diff --git a/crates/nvisy-ocr/src/prelude.rs b/crates/nvisy-ocr/src/prelude.rs index 342b0e9b..498466ee 100644 --- a/crates/nvisy-ocr/src/prelude.rs +++ b/crates/nvisy-ocr/src/prelude.rs @@ -1,5 +1,7 @@ //! Convenience re-exports. -pub use crate::backend::{Backend, ImageFormat, ImageInput, ImageOutput, ImageRegion, RunParams}; +pub use crate::backend::{ + Backend, Block, BlockKind, ImageFormat, ImageInput, ImageOutput, Line, Page, RunParams, Word, +}; pub use crate::engine::{OcrEngine, OcrProvider}; pub use crate::provider::*; diff --git a/crates/nvisy-ocr/src/provider/aws_textract/backend.rs b/crates/nvisy-ocr/src/provider/aws_textract/backend.rs index 2a113a56..9dcceda7 100644 --- a/crates/nvisy-ocr/src/provider/aws_textract/backend.rs +++ b/crates/nvisy-ocr/src/provider/aws_textract/backend.rs @@ -2,6 +2,7 @@ //! //! [`Backend`]: crate::Backend +use std::collections::HashMap; use std::fmt; use hmac::{Hmac, Mac}; @@ -13,7 +14,8 @@ use sha2::{Digest, Sha256}; use super::AwsTextractParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, }; /// [`Backend`] implementation for AWS Textract. 
@@ -124,9 +126,18 @@ struct TextractResponse { #[serde(rename_all = "PascalCase")] struct TextractBlock { block_type: String, + id: Option, text: Option, confidence: Option, geometry: Option, + relationships: Option>, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct TextractRelationship { + r#type: String, + ids: Vec, } #[derive(Debug, Deserialize)] @@ -152,6 +163,30 @@ struct TextractPoint { y: f64, } +fn extract_geometry(geom: Option<&TextractGeometry>) -> (BoundingBox, Option) { + match geom { + Some(geom) => { + let bbox = geom + .bounding_box + .as_ref() + .map(|b| BoundingBox { + x: b.left, + y: b.top, + width: b.width, + height: b.height, + }) + .unwrap_or_default(); + + let polygon = geom.polygon.as_ref().map(|pts| Polygon { + vertices: pts.iter().map(|p| Vertex::new(p.x, p.y)).collect(), + }); + + (bbox, polygon) + } + None => (BoundingBox::default(), None), + } +} + #[async_trait::async_trait] impl Backend for AwsTextractBackend { async fn run(&self, image: &ImageInput, params: &RunParams) -> Result { @@ -220,53 +255,116 @@ impl Backend for AwsTextractBackend { })?; let threshold = params.confidence_threshold; + + // Index blocks by ID for relationship lookups. + let block_map: HashMap<&str, &TextractBlock> = parsed + .blocks + .iter() + .filter_map(|b| b.id.as_deref().map(|id| (id, b))) + .collect(); + + fn child_ids(block: &TextractBlock) -> Vec<&str> { + block + .relationships + .as_ref() + .into_iter() + .flatten() + .filter(|r| r.r#type == "CHILD") + .flat_map(|r| r.ids.iter().map(|s| s.as_str())) + .collect() + } + let mut output = ImageOutput::new(image.source.derive()); + // Iterate PAGE blocks; build LINE→WORD tree from relationships. 
+ let mut page_number = 0u32; for block in &parsed.blocks { - if block.block_type != "WORD" { + if block.block_type != "PAGE" { continue; } + page_number += 1; + + let line_ids = child_ids(block); + let mut lines = Vec::new(); + + for line_id in &line_ids { + let line_block = match block_map.get(line_id) { + Some(b) if b.block_type == "LINE" => b, + _ => continue, + }; + + let word_ids = child_ids(line_block); + let mut words = Vec::new(); + + for word_id in &word_ids { + let word_block = match block_map.get(word_id) { + Some(b) if b.block_type == "WORD" => b, + _ => continue, + }; + + let text = match &word_block.text { + Some(t) => t.clone(), + None => continue, + }; + + // Textract returns confidence as 0–100; normalise to 0–1. + let confidence = word_block.confidence.unwrap_or(0.0) / 100.0; + if confidence < threshold { + continue; + } - let text = match &block.text { - Some(t) => t.clone(), - None => continue, - }; - - // Textract returns confidence as 0–100; normalise to 0–1. - let confidence = block.confidence.unwrap_or(0.0) / 100.0; - - if confidence < threshold { - continue; - } + let (bbox, polygon) = extract_geometry(word_block.geometry.as_ref()); - let (bbox, polygon) = match &block.geometry { - Some(geom) => { - let bbox = geom - .bounding_box - .as_ref() - .map(|b| BoundingBox { - x: b.left, - y: b.top, - width: b.width, - height: b.height, - }) - .unwrap_or_default(); - - let polygon = geom.polygon.as_ref().map(|pts| Polygon { - vertices: pts.iter().map(|p| Vertex::new(p.x, p.y)).collect(), + words.push(Word { + text, + confidence: Some(confidence), + bbox, + polygon, }); + } - (bbox, polygon) + if words.is_empty() { + continue; } - None => (BoundingBox::default(), None), - }; - - output.insert(ImageRegion { - text, - confidence: Some(confidence), - bbox, - polygon, - level: Some(TextLevel::Word), + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::>() + .join(" "); + let line_confidence = + line_block.confidence.map(|c| c / 
100.0); + let (line_bbox, line_polygon) = + extract_geometry(line_block.geometry.as_ref()); + + lines.push(Line { + text: line_text, + confidence: line_confidence, + bbox: line_bbox, + polygon: line_polygon, + words, + }); + } + + let block_text = lines + .iter() + .map(|l| l.text.as_str()) + .collect::>() + .join("\n"); + let (page_bbox, _) = extract_geometry(block.geometry.as_ref()); + + output.pages.push(Page { + page_number, + width: Some(page_bbox.width), + height: Some(page_bbox.height), + blocks: vec![Block { + text: block_text, + confidence: None, + bbox: page_bbox, + polygon: None, + kind: BlockKind::Text, + lines, + }], }); } @@ -349,6 +447,70 @@ mod tests { assert!((bbox.width - 0.2).abs() < 0.001); } + #[test] + fn build_hierarchy_from_relationships() { + let json = serde_json::json!({ + "Blocks": [ + { + "BlockType": "PAGE", + "Id": "page-1", + "Geometry": { + "BoundingBox": { "Width": 1.0, "Height": 1.0, "Left": 0.0, "Top": 0.0 } + }, + "Relationships": [{ + "Type": "CHILD", + "Ids": ["line-1"] + }] + }, + { + "BlockType": "LINE", + "Id": "line-1", + "Text": "hello world", + "Confidence": 98.0, + "Geometry": { + "BoundingBox": { "Width": 0.5, "Height": 0.05, "Left": 0.1, "Top": 0.3 } + }, + "Relationships": [{ + "Type": "CHILD", + "Ids": ["word-1", "word-2"] + }] + }, + { + "BlockType": "WORD", + "Id": "word-1", + "Text": "hello", + "Confidence": 99.0, + "Geometry": { + "BoundingBox": { "Width": 0.2, "Height": 0.05, "Left": 0.1, "Top": 0.3 } + } + }, + { + "BlockType": "WORD", + "Id": "word-2", + "Text": "world", + "Confidence": 97.0, + "Geometry": { + "BoundingBox": { "Width": 0.2, "Height": 0.05, "Left": 0.35, "Top": 0.3 } + } + } + ] + }); + + let resp: TextractResponse = serde_json::from_value(json).unwrap(); + assert_eq!(resp.blocks.len(), 4); + + // Verify the relationship structure. 
+ let page = &resp.blocks[0]; + assert_eq!(page.block_type, "PAGE"); + let rels = page.relationships.as_ref().unwrap(); + assert_eq!(rels[0].ids, vec!["line-1"]); + + let line = &resp.blocks[1]; + assert_eq!(line.block_type, "LINE"); + let rels = line.relationships.as_ref().unwrap(); + assert_eq!(rels[0].ids, vec!["word-1", "word-2"]); + } + #[test] fn format_datetime_known_epoch() { // 2024-01-15T11:30:45Z = 1705318245 seconds since epoch diff --git a/crates/nvisy-ocr/src/provider/azure_docai/backend.rs b/crates/nvisy-ocr/src/provider/azure_docai/backend.rs index 5abdaab7..33579361 100644 --- a/crates/nvisy-ocr/src/provider/azure_docai/backend.rs +++ b/crates/nvisy-ocr/src/provider/azure_docai/backend.rs @@ -5,14 +5,15 @@ use std::fmt; use nvisy_core::Error; -use nvisy_core::math::{Polygon, Vertex}; +use nvisy_core::math::{BoundingBox, Polygon, Vertex}; use nvisy_http::HttpClient; use serde::Deserialize; use tokio::time::{Duration, sleep}; use super::AzureDocaiParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, }; /// [`Backend`] implementation for Azure Document Intelligence. 
@@ -193,8 +194,10 @@ impl Backend for AzureDocaiBackend { None => return Ok(output), }; - for page in &result.pages { - for word in &page.words { + for (page_idx, azure_page) in result.pages.iter().enumerate() { + let mut words = Vec::new(); + + for word in &azure_page.words { if word.confidence < threshold { continue; } @@ -216,14 +219,45 @@ impl Backend for AzureDocaiBackend { .map(|p| p.bounding_box()) .unwrap_or_default(); - output.insert(ImageRegion { + words.push(Word { text: word.content.clone(), confidence: Some(word.confidence), bbox, polygon, - level: Some(TextLevel::Word), }); } + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::>() + .join(" "); + let line_bbox = + BoundingBox::enclosing(words.iter().map(|w| &w.bbox)); + + let line = Line { + text: line_text.clone(), + confidence: None, + bbox: line_bbox, + polygon: None, + words, + }; + + let block = Block { + text: line_text, + confidence: None, + bbox: line_bbox, + polygon: None, + kind: BlockKind::Text, + lines: vec![line], + }; + + output.pages.push(Page { + page_number: (page_idx + 1) as u32, + width: None, + height: None, + blocks: vec![block], + }); } Ok(output) diff --git a/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs b/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs index 2fd8e5b6..497cf94e 100644 --- a/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs +++ b/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs @@ -10,17 +10,17 @@ use serde::Deserialize; use super::SuryaParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, image_part, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, image_part, }; /// [`Backend`] implementation for Surya OCR. /// /// Sends images as multipart form data to `{base_url}/ocr` and parses -/// word-level results into [`ImageRegion`]. 
Surya returns both a 4-point -/// polygon and an axis-aligned bounding box in pixel coordinates. +/// the response into a hierarchical page/block/line/word tree. +/// Surya's `TextLine` maps directly to [`Line`]. /// /// [`Backend`]: crate::Backend -/// [`ImageRegion`]: crate::ImageRegxion #[derive(Debug)] pub struct SuryaBackend { client: HttpClient, @@ -94,9 +94,13 @@ impl Backend for SuryaBackend { let threshold = params.confidence_threshold; let mut output = ImageOutput::new(image.source.derive()); - for page in &parsed.pages { - for line in &page.text_lines { - for word in &line.words { + for (page_idx, surya_page) in parsed.pages.iter().enumerate() { + let mut lines = Vec::new(); + + for text_line in &surya_page.text_lines { + let mut words = Vec::new(); + + for word in &text_line.words { if word.confidence < threshold { continue; } @@ -118,15 +122,56 @@ impl Backend for SuryaBackend { .collect(), }; - output.insert(ImageRegion { + words.push(Word { text: word.text.clone(), confidence: Some(word.confidence), bbox, polygon: Some(polygon), - level: Some(TextLevel::Word), }); } + + if words.is_empty() { + continue; + } + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::>() + .join(" "); + let line_bbox = + BoundingBox::enclosing(words.iter().map(|w| &w.bbox)); + + lines.push(Line { + text: line_text, + confidence: None, + bbox: line_bbox, + polygon: None, + words, + }); } + + let block_text = lines + .iter() + .map(|l| l.text.as_str()) + .collect::>() + .join("\n"); + let block_bbox = + BoundingBox::enclosing(lines.iter().map(|l| &l.bbox)); + + output.pages.push(Page { + page_number: (page_idx + 1) as u32, + width: None, + height: None, + blocks: vec![Block { + text: block_text, + confidence: None, + bbox: block_bbox, + polygon: None, + kind: BlockKind::Text, + lines, + }], + }); } Ok(output) diff --git a/crates/nvisy-ocr/src/provider/google_vision/backend.rs b/crates/nvisy-ocr/src/provider/google_vision/backend.rs index 
41a58ce7..b77817ec 100644 --- a/crates/nvisy-ocr/src/provider/google_vision/backend.rs +++ b/crates/nvisy-ocr/src/provider/google_vision/backend.rs @@ -5,19 +5,21 @@ use std::fmt; use nvisy_core::Error; -use nvisy_core::math::{Polygon, Vertex}; +use nvisy_core::math::{BoundingBox, Polygon, Vertex}; use nvisy_http::HttpClient; use serde::Deserialize; use super::GoogleVisionParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, }; /// [`Backend`] implementation for Google Cloud Vision API. /// /// Sends images as base64-encoded JSON to the `images:annotate` endpoint -/// and parses word-level results from the `fullTextAnnotation` response. +/// and parses the `fullTextAnnotation` response into a hierarchical +/// page/block/line/word tree. /// /// [`Backend`]: crate::Backend pub struct GoogleVisionBackend { @@ -104,6 +106,32 @@ struct GvVertex { y: Option, } +fn gv_polygon(bp: &GvBoundingPoly) -> Polygon { + Polygon { + vertices: bp + .vertices + .iter() + .map(|v| { + Vertex::new( + f64::from(v.x.unwrap_or(0)), + f64::from(v.y.unwrap_or(0)), + ) + }) + .collect(), + } +} + +fn gv_bbox_polygon(bp: Option<&GvBoundingPoly>) -> (BoundingBox, Option) { + match bp { + Some(bp) => { + let polygon = gv_polygon(bp); + let bbox = polygon.bounding_box(); + (bbox, Some(polygon)) + } + None => (BoundingBox::default(), None), + } +} + #[async_trait::async_trait] impl Backend for GoogleVisionBackend { async fn run(&self, image: &ImageInput, params: &RunParams) -> Result { @@ -142,51 +170,92 @@ impl Backend for GoogleVisionBackend { let threshold = params.confidence_threshold; let mut output = ImageOutput::new(image.source.derive()); - for result in &parsed.responses { + for (result_idx, result) in parsed.responses.iter().enumerate() { let annotation = match &result.full_text_annotation { Some(a) => a, None => continue, 
}; - for page in &annotation.pages { - for block in &page.blocks { - for paragraph in &block.paragraphs { - for word in ¶graph.words { - if word.confidence < threshold { + for (page_idx, gv_page) in annotation.pages.iter().enumerate() { + let mut blocks = Vec::new(); + + for gv_block in &gv_page.blocks { + let mut lines = Vec::new(); + + // Each GV paragraph maps to a Line. + for paragraph in &gv_block.paragraphs { + let mut words = Vec::new(); + + for gv_word in ¶graph.words { + if gv_word.confidence < threshold { continue; } let text: String = - word.symbols.iter().map(|s| s.text.as_str()).collect(); - - let polygon = word.bounding_box.as_ref().map(|bp| Polygon { - vertices: bp - .vertices - .iter() - .map(|v| { - Vertex::new( - f64::from(v.x.unwrap_or(0)), - f64::from(v.y.unwrap_or(0)), - ) - }) - .collect(), - }); + gv_word.symbols.iter().map(|s| s.text.as_str()).collect(); - let bbox = polygon - .as_ref() - .map(|p| p.bounding_box()) - .unwrap_or_default(); + let (bbox, polygon) = + gv_bbox_polygon(gv_word.bounding_box.as_ref()); - output.insert(ImageRegion { + words.push(Word { text, - confidence: Some(word.confidence), + confidence: Some(gv_word.confidence), bbox, polygon, - level: Some(TextLevel::Word), }); } + + if words.is_empty() { + continue; + } + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::>() + .join(" "); + let line_bbox = BoundingBox::enclosing( + words.iter().map(|w| &w.bbox), + ); + + lines.push(Line { + text: line_text, + confidence: None, + bbox: line_bbox, + polygon: None, + words, + }); } + + if lines.is_empty() { + continue; + } + + let block_text = lines + .iter() + .map(|l| l.text.as_str()) + .collect::>() + .join("\n"); + let block_bbox = BoundingBox::enclosing( + lines.iter().map(|l| &l.bbox), + ); + + blocks.push(Block { + text: block_text, + confidence: None, + bbox: block_bbox, + polygon: None, + kind: BlockKind::Text, + lines, + }); } + + output.pages.push(Page { + page_number: (result_idx + page_idx + 
1) as u32, + width: None, + height: None, + blocks, + }); } } diff --git a/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs b/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs index e5562b9f..14a85203 100644 --- a/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs +++ b/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs @@ -3,25 +3,24 @@ //! [`Backend`]: crate::Backend use nvisy_core::Error; -use nvisy_core::math::{Polygon, Vertex}; +use nvisy_core::math::{BoundingBox, Polygon, Vertex}; use nvisy_http::HttpClient; use reqwest_middleware::reqwest::multipart::Form; use serde::Deserialize; use super::DoctrParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, image_part, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, image_part, }; /// [`Backend`] implementation for DocTR. /// /// Sends images as multipart form data to `{base_url}/ocr` and parses -/// word-level results into [`ImageRegion`]. DocTR returns normalised 0..1 -/// coordinates that are denormalised using the `dimensions` field from -/// the response. +/// word-level results into a hierarchical tree. DocTR returns normalised +/// 0..1 coordinates that are denormalised using the `dimensions` field. 
/// /// [`Backend`]: crate::Backend -/// [`ImageRegion`]: crate::ImageRegion #[derive(Debug)] pub struct DoctrBackend { client: HttpClient, base_url: String, } @@ -90,8 +89,9 @@ impl Backend for DoctrBackend { let threshold = params.confidence_threshold; let mut output = ImageOutput::new(image.source.derive()); - for page in &parsed.pages { + for (page_idx, page) in parsed.pages.iter().enumerate() { let [height, width] = page.dimensions; + let mut words = Vec::new(); for word in &page.words { if word.confidence < threshold { @@ -115,14 +115,45 @@ }; let bbox = polygon.bounding_box(); - output.insert(ImageRegion { + words.push(Word { text: word.value.clone(), confidence: Some(word.confidence), bbox, polygon: Some(polygon), - level: Some(TextLevel::Word), }); } + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::<Vec<_>>() + .join(" "); + let line_bbox = + BoundingBox::enclosing(words.iter().map(|w| &w.bbox)); + + let line = Line { + text: line_text.clone(), + confidence: None, + bbox: line_bbox, + polygon: None, + words, + }; + + let block = Block { + text: line_text, + confidence: None, + bbox: line_bbox, + polygon: None, + kind: BlockKind::Text, + lines: vec![line], + }; + + output.pages.push(Page { + page_number: (page_idx + 1) as u32, + width: Some(width), + height: Some(height), + blocks: vec![block], + }); } Ok(output) diff --git a/crates/nvisy-ocr/src/provider/paddle_paddlex/backend.rs b/crates/nvisy-ocr/src/provider/paddle_paddlex/backend.rs index 2dd8236e..5ca411ac 100644 --- a/crates/nvisy-ocr/src/provider/paddle_paddlex/backend.rs +++ b/crates/nvisy-ocr/src/provider/paddle_paddlex/backend.rs @@ -3,23 +3,24 @@ //! 
[`Backend`]: crate::Backend use nvisy_core::Error; -use nvisy_core::math::{Polygon, Vertex}; +use nvisy_core::math::{BoundingBox, Polygon, Vertex}; use nvisy_http::HttpClient; use reqwest_middleware::reqwest::multipart::Form; use serde::Deserialize; use super::PaddleXParams; use crate::backend::{ - Backend, ImageInput, ImageOutput, ImageRegion, RunParams, TextLevel, check_response, image_part, + Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, + check_response, image_part, }; /// [`Backend`] implementation for PaddleX PP-OCRv5. /// /// Sends images as multipart form data to `{base_url}/ocr` with -/// `returnWordBox=true` and parses word-level results into [`ImageRegion`]. +/// `returnWordBox=true` and parses word-level results into a +/// hierarchical page/block/line/word tree. /// /// [`Backend`]: crate::Backend -/// [`ImageRegion`]: crate::ImageRegion #[derive(Debug)] pub struct PaddleXBackend { client: HttpClient, base_url: String, } @@ -99,7 +100,11 @@ impl Backend for PaddleXBackend { let threshold = params.confidence_threshold; let mut output = ImageOutput::new(image.source.derive()); + let mut lines = Vec::new(); + for ocr_result in &parsed.result.ocr_results { + let mut words = Vec::new(); + for word in &ocr_result.word_results { if word.confidence < threshold { continue; @@ -114,16 +119,57 @@ }; let bbox = polygon.bounding_box(); - output.insert(ImageRegion { + words.push(Word { text: word.text.clone(), confidence: Some(word.confidence), bbox, polygon: Some(polygon), - level: Some(TextLevel::Word), }); } + + if words.is_empty() { + continue; + } + + let line_text = words + .iter() + .map(|w| w.text.as_str()) + .collect::<Vec<_>>() + .join(" "); + let line_bbox = + BoundingBox::enclosing(words.iter().map(|w| &w.bbox)); + + lines.push(Line { + text: line_text, + confidence: None, + bbox: line_bbox, + polygon: None, + words, + }); } + let block_text = lines + .iter() + .map(|l| l.text.as_str()) + .collect::<Vec<_>>() + 
.join("\n"); + let block_bbox = + BoundingBox::enclosing(lines.iter().map(|l| &l.bbox)); + + output.pages.push(Page { + page_number: 1, + width: None, + height: None, + blocks: vec![Block { + text: block_text, + confidence: None, + bbox: block_bbox, + polygon: None, + kind: BlockKind::Text, + lines, + }], + }); + Ok(output) } } From fc45db3c9bff2a09472a86e0cac88cd70371384f Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 8 Mar 2026 07:51:44 +0100 Subject: [PATCH 2/8] style(core): format BoundingBox derive attributes Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-core/src/math/bounding_box.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/crates/nvisy-core/src/math/bounding_box.rs b/crates/nvisy-core/src/math/bounding_box.rs index 29ec65a5..722f96da 100644 --- a/crates/nvisy-core/src/math/bounding_box.rs +++ b/crates/nvisy-core/src/math/bounding_box.rs @@ -8,16 +8,8 @@ use serde::{Deserialize, Serialize}; /// Coordinates are `f64` to support both pixel and normalized (0.0–1.0) /// values from detection models. Use [`BoundingBoxU32`] (or [`Into`]) /// when integer pixel coordinates are needed for rendering. -#[derive( - Debug, - Clone, - Copy, - Default, - PartialEq, - Serialize, - Deserialize, - JsonSchema -)] +#[derive(Debug, Clone, Copy, Default, PartialEq)] +#[derive(Serialize, Deserialize, JsonSchema)] pub struct BoundingBox { /// Horizontal offset of the top-left corner (pixels or normalized). pub x: f64, From c107dac8f528a7e3e56669e65548da2ec0fe43ed Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 8 Mar 2026 07:59:07 +0100 Subject: [PATCH 3/8] refactor(ocr): remove DocTR provider Delete the mindee_doctr backend, params, and module. Update OcrProvider enum, doc examples, and README to reflect five remaining providers. 
Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-ocr/README.md | 9 +- crates/nvisy-ocr/src/engine/mod.rs | 4 +- crates/nvisy-ocr/src/engine/params.rs | 8 +- .../src/provider/mindee_doctr/backend.rs | 220 ------------------ .../src/provider/mindee_doctr/mod.rs | 10 - .../src/provider/mindee_doctr/params.rs | 10 - crates/nvisy-ocr/src/provider/mod.rs | 2 - 7 files changed, 7 insertions(+), 256 deletions(-) delete mode 100644 crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs delete mode 100644 crates/nvisy-ocr/src/provider/mindee_doctr/mod.rs delete mode 100644 crates/nvisy-ocr/src/provider/mindee_doctr/params.rs diff --git a/crates/nvisy-ocr/README.md b/crates/nvisy-ocr/README.md index 9dcd0459..6ecfe65d 100644 --- a/crates/nvisy-ocr/README.md +++ b/crates/nvisy-ocr/README.md @@ -4,11 +4,10 @@ OCR backend trait, type-erased engine, and provider implementations for the Nvisy runtime. -Defines the `Backend` trait for text extraction from images and ships six +Defines the `Backend` trait for text extraction from images and ships five provider implementations across local and cloud services: **Local** (always available): -- `DoctrBackend`: DocTR server (multipart upload, normalised coordinates) - `PaddleXBackend`: PaddleX PP-OCRv5 server (multipart upload, word-level boxes) - `SuryaBackend`: Surya OCR server (multipart upload, pixel coordinates) @@ -17,9 +16,9 @@ provider implementations across local and cloud services: - `GoogleVisionBackend`: Google Cloud Vision API (`google-vision` feature) - `AzureDocaiBackend`: Azure Document Intelligence with async polling (`azure-docai` feature) -Every backend returns `ImageOutput` containing a list of `ImageRegion`s, -each with extracted text, optional confidence score, bounding box, polygon -vertices for rotated text, and hierarchical text-level annotations. 
+Every backend returns `ImageOutput` containing a hierarchical tree of +`Page` → `Block` → `Line` → `Word`, each with extracted text, optional +confidence score, bounding box, and polygon vertices for rotated text. The `Engine` wrapper provides a type-erased entry point with built-in `tracing` instrumentation for request-level observability. diff --git a/crates/nvisy-ocr/src/engine/mod.rs b/crates/nvisy-ocr/src/engine/mod.rs index 45dda24e..6a6c8024 100644 --- a/crates/nvisy-ocr/src/engine/mod.rs +++ b/crates/nvisy-ocr/src/engine/mod.rs @@ -21,9 +21,9 @@ use crate::backend::{Backend, ImageInput, ImageOutput, RunParams}; /// /// ```ignore /// use nvisy_ocr::{OcrEngine, ImageInput, ImageFormat, RunParams}; -/// use nvisy_ocr::provider::{DoctrBackend, DoctrParams}; +/// use nvisy_ocr::provider::{SuryaBackend, SuryaParams}; /// -/// let backend = DoctrBackend::new(DoctrParams { base_url: "http://localhost:8000".into() }); +/// let backend = SuryaBackend::new(SuryaParams { base_url: "http://localhost:8000".into() }); /// let engine = OcrEngine::new(backend); /// /// let image = ImageInput::new(png_bytes, ImageFormat::Png); diff --git a/crates/nvisy-ocr/src/engine/params.rs b/crates/nvisy-ocr/src/engine/params.rs index 61f92e98..b9fcd0d5 100644 --- a/crates/nvisy-ocr/src/engine/params.rs +++ b/crates/nvisy-ocr/src/engine/params.rs @@ -5,9 +5,7 @@ use serde::{Deserialize, Serialize}; use crate::provider::{AwsTextractBackend, AwsTextractParams}; #[cfg(feature = "azure-docai")] use crate::provider::{AzureDocaiBackend, AzureDocaiParams}; -use crate::provider::{ - DoctrBackend, DoctrParams, PaddleXBackend, PaddleXParams, SuryaBackend, SuryaParams, -}; +use crate::provider::{PaddleXBackend, PaddleXParams, SuryaBackend, SuryaParams}; #[cfg(feature = "google-vision")] use crate::provider::{GoogleVisionBackend, GoogleVisionParams}; @@ -21,8 +19,6 @@ use crate::provider::{GoogleVisionBackend, GoogleVisionParams}; pub enum OcrProvider { /// Datalab Surya OCR. 
Surya(SuryaParams), - /// Mindee DocTR. - Doctr(DoctrParams), /// PaddlePaddle PaddleX PP-OCRv5. PaddleX(PaddleXParams), /// AWS Textract. @@ -44,7 +40,6 @@ impl OcrProvider { pub fn into_engine(self) -> super::OcrEngine { match self { Self::Surya(p) => super::OcrEngine::new(SuryaBackend::new(p)), - Self::Doctr(p) => super::OcrEngine::new(DoctrBackend::new(p)), Self::PaddleX(p) => super::OcrEngine::new(PaddleXBackend::new(p)), #[cfg(feature = "aws-textract")] Self::AwsTextract(p) => super::OcrEngine::new(AwsTextractBackend::new(p)), @@ -62,7 +57,6 @@ impl OcrProvider { pub fn into_engine_with_client(self, client: HttpClient) -> super::OcrEngine { match self { Self::Surya(p) => super::OcrEngine::new(SuryaBackend::with_client(client, p)), - Self::Doctr(p) => super::OcrEngine::new(DoctrBackend::with_client(client, p)), Self::PaddleX(p) => super::OcrEngine::new(PaddleXBackend::with_client(client, p)), #[cfg(feature = "aws-textract")] Self::AwsTextract(p) => { diff --git a/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs b/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs deleted file mode 100644 index 14a85203..00000000 --- a/crates/nvisy-ocr/src/provider/mindee_doctr/backend.rs +++ /dev/null @@ -1,220 +0,0 @@ -//! [`Backend`] implementation for DocTR. -//! -//! [`Backend`]: crate::Backend - -use nvisy_core::Error; -use nvisy_core::math::{BoundingBox, Polygon, Vertex}; -use nvisy_http::HttpClient; -use reqwest_middleware::reqwest::multipart::Form; -use serde::Deserialize; - -use super::DoctrParams; -use crate::backend::{ - Backend, Block, BlockKind, ImageInput, ImageOutput, Line, Page, RunParams, Word, - check_response, image_part, -}; - -/// [`Backend`] implementation for DocTR. -/// -/// Sends images as multipart form data to `{base_url}/ocr` and parses -/// word-level results into a hierarchical tree. DocTR returns normalised -/// 0..1 coordinates that are denormalised using the `dimensions` field. 
-/// -/// [`Backend`]: crate::Backend -#[derive(Debug)] -pub struct DoctrBackend { - client: HttpClient, - base_url: String, -} - -impl DoctrBackend { - /// Create a new backend with default HTTP configuration. - pub fn new(params: DoctrParams) -> Self { - Self::with_client(HttpClient::default(), params) - } - - /// Create a new backend with a pre-configured HTTP client. - pub fn with_client(client: HttpClient, params: DoctrParams) -> Self { - Self { - client, - base_url: params.base_url, - } - } -} - -#[derive(Debug, Deserialize)] -struct DoctrResponse { - pages: Vec, -} - -#[derive(Debug, Deserialize)] -struct DoctrPage { - /// `[height, width]` in pixels. - dimensions: [f64; 2], - words: Vec, -} - -#[derive(Debug, Deserialize)] -struct DoctrWord { - value: String, - #[serde(default)] - confidence: f64, - /// `[[x_min, y_min], [x_max, y_max]]` in normalised 0–1 coords. - geometry: [[f64; 2]; 2], -} - -#[async_trait::async_trait] -impl Backend for DoctrBackend { - async fn run(&self, image: &ImageInput, params: &RunParams) -> Result { - let file_part = image_part(image)?; - - let form = Form::new().part("file", file_part); - - let url = format!("{}/ocr", self.base_url.trim_end_matches('/')); - - let resp = self - .client - .post(&url) - .multipart(form) - .send() - .await - .map_err(|e| Error::connection(e.to_string(), "doctr_ocr", true))?; - - let resp = check_response(resp, "DocTR").await?; - - let parsed: DoctrResponse = resp.json().await.map_err(|e| { - Error::runtime(format!("DocTR JSON parse error: {e}"), "doctr_ocr", false) - })?; - - let threshold = params.confidence_threshold; - let mut output = ImageOutput::new(image.source.derive()); - - for (page_idx, page) in parsed.pages.iter().enumerate() { - let [height, width] = page.dimensions; - let mut words = Vec::new(); - - for word in &page.words { - if word.confidence < threshold { - continue; - } - - let [[x_min_n, y_min_n], [x_max_n, y_max_n]] = word.geometry; - - let x_min = x_min_n * width; - let y_min 
= y_min_n * height; - let x_max = x_max_n * width; - let y_max = y_max_n * height; - - let polygon = Polygon { - vertices: vec![ - Vertex::new(x_min, y_min), // TL - Vertex::new(x_max, y_min), // TR - Vertex::new(x_max, y_max), // BR - Vertex::new(x_min, y_max), // BL - ], - }; - let bbox = polygon.bounding_box(); - - words.push(Word { - text: word.value.clone(), - confidence: Some(word.confidence), - bbox, - polygon: Some(polygon), - }); - } - - let line_text = words - .iter() - .map(|w| w.text.as_str()) - .collect::>() - .join(" "); - let line_bbox = - BoundingBox::enclosing(words.iter().map(|w| &w.bbox)); - - let line = Line { - text: line_text.clone(), - confidence: None, - bbox: line_bbox, - polygon: None, - words, - }; - - let block = Block { - text: line_text, - confidence: None, - bbox: line_bbox, - polygon: None, - kind: BlockKind::Text, - lines: vec![line], - }; - - output.pages.push(Page { - page_number: (page_idx + 1) as u32, - width: Some(width), - height: Some(height), - blocks: vec![block], - }); - } - - Ok(output) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn parse_response() { - let json = serde_json::json!({ - "pages": [{ - "dimensions": [1000.0, 2000.0], - "words": [ - { - "value": "hello", - "confidence": 0.97, - "geometry": [[0.05, 0.10], [0.15, 0.14]] - }, - { - "value": "world", - "confidence": 0.95, - "geometry": [[0.20, 0.10], [0.30, 0.14]] - } - ] - }] - }); - - let resp: DoctrResponse = serde_json::from_value(json).unwrap(); - assert_eq!(resp.pages.len(), 1); - assert_eq!(resp.pages[0].words.len(), 2); - - let page = &resp.pages[0]; - let [height, width] = page.dimensions; - let word = &page.words[0]; - - // Denormalise: x_min = 0.05 * 2000 = 100, y_min = 0.10 * 1000 = 100 - let x_min = word.geometry[0][0] * width; - let y_min = word.geometry[0][1] * height; - let x_max = word.geometry[1][0] * width; - let y_max = word.geometry[1][1] * height; - - assert!((x_min - 100.0).abs() < 0.01); - assert!((y_min - 
100.0).abs() < 0.01); - assert!((x_max - 300.0).abs() < 0.01); - assert!((y_max - 140.0).abs() < 0.01); - - let polygon = Polygon { - vertices: vec![ - Vertex::new(x_min, y_min), - Vertex::new(x_max, y_min), - Vertex::new(x_max, y_max), - Vertex::new(x_min, y_max), - ], - }; - let bbox = polygon.bounding_box(); - assert!((bbox.x - 100.0).abs() < 0.01); - assert!((bbox.y - 100.0).abs() < 0.01); - assert!((bbox.width - 200.0).abs() < 0.01); - assert!((bbox.height - 40.0).abs() < 0.01); - } -} diff --git a/crates/nvisy-ocr/src/provider/mindee_doctr/mod.rs b/crates/nvisy-ocr/src/provider/mindee_doctr/mod.rs deleted file mode 100644 index 338f49dd..00000000 --- a/crates/nvisy-ocr/src/provider/mindee_doctr/mod.rs +++ /dev/null @@ -1,10 +0,0 @@ -//! DocTR OCR backend. -//! -//! Sends images as multipart form data to a DocTR server and parses -//! word-level results with normalised-to-pixel coordinate conversion. - -mod backend; -mod params; - -pub use backend::DoctrBackend; -pub use params::DoctrParams; diff --git a/crates/nvisy-ocr/src/provider/mindee_doctr/params.rs b/crates/nvisy-ocr/src/provider/mindee_doctr/params.rs deleted file mode 100644 index ec73c615..00000000 --- a/crates/nvisy-ocr/src/provider/mindee_doctr/params.rs +++ /dev/null @@ -1,10 +0,0 @@ -use serde::{Deserialize, Serialize}; - -/// Constructor parameters for [`DoctrBackend`]. -/// -/// [`DoctrBackend`]: super::DoctrBackend -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DoctrParams { - /// Base URL of the DocTR server. - pub base_url: String, -} diff --git a/crates/nvisy-ocr/src/provider/mod.rs b/crates/nvisy-ocr/src/provider/mod.rs index dd24c590..bf4b7831 100644 --- a/crates/nvisy-ocr/src/provider/mod.rs +++ b/crates/nvisy-ocr/src/provider/mod.rs @@ -1,11 +1,9 @@ //! All OCR backend implementations and their parameter types. 
mod datalab_surya; -mod mindee_doctr; mod paddle_paddlex; pub use datalab_surya::{SuryaBackend, SuryaParams}; -pub use mindee_doctr::{DoctrBackend, DoctrParams}; pub use paddle_paddlex::{PaddleXBackend, PaddleXParams}; #[cfg(feature = "aws-textract")] From 43c9dd38fbb7c4ff7d8042e22b3513a6262dce41 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 8 Mar 2026 08:02:36 +0100 Subject: [PATCH 4/8] refactor(core): rename BoundingBoxU32 to BoundingBoxPixel Also renames to_u32() to to_pixel() for consistency. Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-codec/src/transform/image/ops.rs | 14 +++++++------- .../src/transform/image/transform.rs | 2 +- crates/nvisy-core/src/math/bounding_box.rs | 18 +++++++++--------- crates/nvisy-core/src/math/mod.rs | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/crates/nvisy-codec/src/transform/image/ops.rs b/crates/nvisy-codec/src/transform/image/ops.rs index 47547213..89dbb22e 100644 --- a/crates/nvisy-codec/src/transform/image/ops.rs +++ b/crates/nvisy-codec/src/transform/image/ops.rs @@ -6,22 +6,22 @@ use image::DynamicImage; use image::imageops::FilterType; use imageproc::filter::gaussian_blur_f32; -use nvisy_core::math::BoundingBoxU32; +use nvisy_core::math::BoundingBoxPixel; /// Mutating image-transform operations on individual bounding-box regions. pub trait ImageOps { /// Apply a gaussian blur to `region` with the given `sigma`. - fn apply_gaussian_blur(&mut self, region: &BoundingBoxU32, sigma: f32); + fn apply_gaussian_blur(&mut self, region: &BoundingBoxPixel, sigma: f32); /// Fill `region` with a solid RGBA `color`. - fn apply_block_overlay(&mut self, region: &BoundingBoxU32, color: [u8; 4]); + fn apply_block_overlay(&mut self, region: &BoundingBoxPixel, color: [u8; 4]); /// Pixelate `region` with the given `block_size`. 
- fn apply_pixelate(&mut self, region: &BoundingBoxU32, block_size: u32); + fn apply_pixelate(&mut self, region: &BoundingBoxPixel, block_size: u32); } impl ImageOps for DynamicImage { - fn apply_gaussian_blur(&mut self, region: &BoundingBoxU32, sigma: f32) { + fn apply_gaussian_blur(&mut self, region: &BoundingBoxPixel, sigma: f32) { let (x, y, w, h) = (region.x, region.y, region.width, region.height); let img_w = self.width(); @@ -40,7 +40,7 @@ impl ImageOps for DynamicImage { image::imageops::overlay(self, &blurred, x as i64, y as i64); } - fn apply_block_overlay(&mut self, region: &BoundingBoxU32, color: [u8; 4]) { + fn apply_block_overlay(&mut self, region: &BoundingBoxPixel, color: [u8; 4]) { let (x, y, w, h) = (region.x, region.y, region.width, region.height); let img_w = self.width(); @@ -55,7 +55,7 @@ impl ImageOps for DynamicImage { image::imageops::overlay(self, &block, x as i64, y as i64); } - fn apply_pixelate(&mut self, region: &BoundingBoxU32, block_size: u32) { + fn apply_pixelate(&mut self, region: &BoundingBoxPixel, block_size: u32) { let block_size = block_size.max(1); let (x, y, w, h) = (region.x, region.y, region.width, region.height); diff --git a/crates/nvisy-codec/src/transform/image/transform.rs b/crates/nvisy-codec/src/transform/image/transform.rs index 8b745c11..e7f2c160 100644 --- a/crates/nvisy-codec/src/transform/image/transform.rs +++ b/crates/nvisy-codec/src/transform/image/transform.rs @@ -51,7 +51,7 @@ where let mut img: DynamicImage = image_data.into_inner(); for redaction in redactions { - let region = redaction.bounding_box.to_u32(); + let region = redaction.bounding_box.to_pixel(); match &redaction.output { ImageOutput::Blur { sigma } => { img.apply_gaussian_blur(®ion, *sigma); diff --git a/crates/nvisy-core/src/math/bounding_box.rs b/crates/nvisy-core/src/math/bounding_box.rs index 722f96da..70ccffed 100644 --- a/crates/nvisy-core/src/math/bounding_box.rs +++ b/crates/nvisy-core/src/math/bounding_box.rs @@ -6,7 +6,7 @@ use 
serde::{Deserialize, Serialize}; /// Axis-aligned bounding box for image-based entity locations. /// /// Coordinates are `f64` to support both pixel and normalized (0.0–1.0) -/// values from detection models. Use [`BoundingBoxU32`] (or [`Into`]) +/// values from detection models. Use [`BoundingBoxPixel`] (or [`Into`]) /// when integer pixel coordinates are needed for rendering. #[derive(Debug, Clone, Copy, Default, PartialEq)] #[derive(Serialize, Deserialize, JsonSchema)] @@ -111,8 +111,8 @@ impl BoundingBox { } /// Convert to integer pixel coordinates by rounding each field. - pub fn to_u32(&self) -> BoundingBoxU32 { - BoundingBoxU32 { + pub fn to_pixel(&self) -> BoundingBoxPixel { + BoundingBoxPixel { x: self.x.round() as u32, y: self.y.round() as u32, width: self.width.round() as u32, @@ -127,7 +127,7 @@ impl BoundingBox { /// integer. Use this at the rendering boundary where pixel-exact /// coordinates are required. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct BoundingBoxU32 { +pub struct BoundingBoxPixel { /// Horizontal offset of the top-left corner in pixels. pub x: u32, /// Vertical offset of the top-left corner in pixels. 
@@ -138,13 +138,13 @@ pub struct BoundingBoxPixel { pub height: u32, } -impl From<&BoundingBox> for BoundingBoxU32 { +impl From<&BoundingBox> for BoundingBoxPixel { fn from(bb: &BoundingBox) -> Self { - bb.to_u32() + bb.to_pixel() } } -impl From<BoundingBox> for BoundingBoxU32 { +impl From<BoundingBox> for BoundingBoxPixel { fn from(bb: BoundingBox) -> Self { Self::from(&bb) } } @@ -221,9 +221,9 @@ mod tests { } #[test] - fn to_u32_rounds() { + fn to_pixel_rounds() { let bb = BoundingBox::new(1.4, 2.6, 3.5, 4.4); - let u = bb.to_u32(); + let u = bb.to_pixel(); assert_eq!(u.x, 1); assert_eq!(u.y, 3); assert_eq!(u.width, 4); diff --git a/crates/nvisy-core/src/math/mod.rs b/crates/nvisy-core/src/math/mod.rs index 0bd098be..0c677122 100644 --- a/crates/nvisy-core/src/math/mod.rs +++ b/crates/nvisy-core/src/math/mod.rs @@ -8,7 +8,7 @@ mod dpi; mod polygon; mod time_span; -pub use bounding_box::{BoundingBox, BoundingBoxU32}; +pub use bounding_box::{BoundingBox, BoundingBoxPixel}; pub use dpi::Dpi; pub use polygon::{Polygon, Vertex}; pub use time_span::TimeSpan; From 9776ee728cadd753210e9d4d392abfc3cf5d8faa Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 8 Mar 2026 16:05:16 +0100 Subject: [PATCH 5/8] feat(ocr): parse page number and image bbox from Surya response Use the upstream `page` and `image_bbox` fields instead of deriving page number from enumeration index and leaving dimensions empty. Co-Authored-By: Claude Opus 4.6 --- .../src/provider/datalab_surya/backend.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs b/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs index 497cf94e..af10c159 100644 --- a/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs +++ b/crates/nvisy-ocr/src/provider/datalab_surya/backend.rs @@ -49,6 +49,10 @@ struct SuryaResponse { #[derive(Debug, Deserialize)] struct SuryaPage { + /// Upstream page number (0-based). 
+ page: u32, + /// Document image bounds `[x_min, y_min, x_max, y_max]`. + image_bbox: [f64; 4], text_lines: Vec, } @@ -94,7 +98,7 @@ impl Backend for SuryaBackend { let threshold = params.confidence_threshold; let mut output = ImageOutput::new(image.source.derive()); - for (page_idx, surya_page) in parsed.pages.iter().enumerate() { + for surya_page in &parsed.pages { let mut lines = Vec::new(); for text_line in &surya_page.text_lines { @@ -159,10 +163,12 @@ impl Backend for SuryaBackend { let block_bbox = BoundingBox::enclosing(lines.iter().map(|l| &l.bbox)); + let [_x_min, _y_min, x_max, y_max] = surya_page.image_bbox; + output.pages.push(Page { - page_number: (page_idx + 1) as u32, - width: None, - height: None, + page_number: surya_page.page + 1, + width: Some(x_max), + height: Some(y_max), blocks: vec![Block { text: block_text, confidence: None, @@ -186,6 +192,8 @@ mod tests { fn parse_response() { let json = serde_json::json!({ "pages": [{ + "page": 0, + "image_bbox": [0.0, 0.0, 800.0, 600.0], "text_lines": [{ "words": [ { From 8a9c941da0a1d62ddd14fb343ed77fad3f8aec13 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 9 Mar 2026 06:11:25 +0100 Subject: [PATCH 6/8] refactor(core): split DocumentType into typed sub-enums Replace the flat DocumentType enum with nested format-specific enums: ImageFormat, WordFormat, PresentationFormat, SpreadsheetFormat, AudioFormat, and TextFormat. Pdf and Html remain standalone variants. Each sub-enum owns its own from_mime/mime_type methods, keeping DocumentType::from_mime as a concise chain of delegates. Remove Archive from ContentKind. Unify nvisy-ocr's ImageFormat with the new core ImageFormat. 
Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 1 - crates/nvisy-codec/src/document/any.rs | 8 +- crates/nvisy-codec/src/document/mod.rs | 5 +- .../src/handler/audio/audio_handler.rs | 10 +- .../src/handler/audio/mp3_handler.rs | 4 +- .../src/handler/audio/wav_handler.rs | 4 +- .../src/handler/image/image_handler.rs | 10 +- .../src/handler/image/jpeg_handler.rs | 2 +- .../src/handler/image/png_handler.rs | 2 +- .../src/handler/rich/docx_handler.rs | 4 +- .../src/handler/text/csv_handler.rs | 4 +- .../src/handler/text/csv_loader.rs | 4 +- .../src/handler/text/json_handler.rs | 4 +- .../src/handler/text/json_loader.rs | 4 +- .../src/handler/text/text_handler.rs | 5 +- .../src/handler/text/txt_handler.rs | 4 +- .../src/handler/text/txt_loader.rs | 4 +- .../src/handler/text/xlsx_handler.rs | 4 +- crates/nvisy-core/src/fs/content_kind.rs | 12 - crates/nvisy-core/src/fs/document_type.rs | 361 ++++++++++++------ crates/nvisy-core/src/fs/mod.rs | 5 +- crates/nvisy-ocr/Cargo.toml | 3 - crates/nvisy-ocr/src/backend/input.rs | 32 +- crates/nvisy-ocr/src/backend/mod.rs | 3 +- 24 files changed, 311 insertions(+), 188 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index da0dafc0..cbbbb4d8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2965,7 +2965,6 @@ dependencies = [ "serde", "serde_json", "sha2", - "strum", "tokio", "tracing", ] diff --git a/crates/nvisy-codec/src/document/any.rs b/crates/nvisy-codec/src/document/any.rs index 521e3ed6..47130b6f 100644 --- a/crates/nvisy-codec/src/document/any.rs +++ b/crates/nvisy-codec/src/document/any.rs @@ -170,6 +170,8 @@ impl From> for AnyDocument { #[cfg(test)] mod tests { + use nvisy_core::fs::{AudioFormat, ImageFormat, TextFormat}; + use super::*; #[test] @@ -177,7 +179,7 @@ mod tests { let doc = Document::new(TxtHandler::new(vec!["hello".into()], false)); let any: AnyDocument = doc.into(); assert!(any.as_text().is_some()); - assert_eq!(any.document_type(), DocumentType::Txt); + assert_eq!(any.document_type(), 
DocumentType::Text(TextFormat::Txt)); } #[test] @@ -186,7 +188,7 @@ mod tests { let doc = Document::new(PngHandler::new(img)); let any: AnyDocument = doc.into(); assert!(any.as_image().is_some()); - assert_eq!(any.document_type(), DocumentType::Png); + assert_eq!(any.document_type(), DocumentType::Image(ImageFormat::Png)); } #[test] @@ -194,7 +196,7 @@ mod tests { let doc = Document::new(WavHandler::new(bytes::Bytes::from_static(b"wav"))); let any: AnyDocument = doc.into(); assert!(any.as_audio().is_some()); - assert_eq!(any.document_type(), DocumentType::Wav); + assert_eq!(any.document_type(), DocumentType::Audio(AudioFormat::Wav)); } #[test] diff --git a/crates/nvisy-codec/src/document/mod.rs b/crates/nvisy-codec/src/document/mod.rs index 2ec54b8e..44f86315 100644 --- a/crates/nvisy-codec/src/document/mod.rs +++ b/crates/nvisy-codec/src/document/mod.rs @@ -178,7 +178,10 @@ mod tests { fn document_type_delegates() { let handler = TxtHandler::new(vec![], false); let doc = Document::new(handler); - assert_eq!(doc.document_type(), DocumentType::Txt); + assert_eq!( + doc.document_type(), + DocumentType::Text(nvisy_core::fs::TextFormat::Txt), + ); } #[test] diff --git a/crates/nvisy-codec/src/handler/audio/audio_handler.rs b/crates/nvisy-codec/src/handler/audio/audio_handler.rs index b7a4e118..e6f3a6ad 100644 --- a/crates/nvisy-codec/src/handler/audio/audio_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/audio_handler.rs @@ -104,7 +104,10 @@ mod tests { #[tokio::test] async fn wav_variant_delegates() { let h = AnyAudio::Wav(WavHandler::new(bytes::Bytes::from_static(b"wav-data"))); - assert_eq!(h.document_type(), DocumentType::Wav); + assert_eq!( + h.document_type(), + DocumentType::Audio(nvisy_core::fs::AudioFormat::Wav), + ); let spans: Vec<_> = h.audio_spans().await.collect().await; assert_eq!(spans.len(), 1); assert_eq!(spans[0].data.as_bytes().as_ref(), b"wav-data"); @@ -113,7 +116,10 @@ mod tests { #[tokio::test] async fn mp3_variant_delegates() { let h = 
AnyAudio::Mp3(Mp3Handler::new(bytes::Bytes::from_static(b"mp3-data"))); - assert_eq!(h.document_type(), DocumentType::Mp3); + assert_eq!( + h.document_type(), + DocumentType::Audio(nvisy_core::fs::AudioFormat::Mp3), + ); assert_eq!(h.encode().unwrap().as_bytes(), b"mp3-data"); } diff --git a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs index 623e0799..a894880c 100644 --- a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs @@ -10,7 +10,7 @@ use bytes::Bytes; use futures::StreamExt; use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{AudioFormat, DocumentType}; use nvisy_core::io::ContentData; use nvisy_core::path::ContentSource; @@ -45,7 +45,7 @@ impl Mp3Handler { impl Handler for Mp3Handler { fn document_type(&self) -> DocumentType { - DocumentType::Mp3 + DocumentType::Audio(AudioFormat::Mp3) } #[tracing::instrument(name = "mp3.encode", skip_all, fields(output_bytes))] diff --git a/crates/nvisy-codec/src/handler/audio/wav_handler.rs b/crates/nvisy-codec/src/handler/audio/wav_handler.rs index 5d769bc4..9f689e80 100644 --- a/crates/nvisy-codec/src/handler/audio/wav_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/wav_handler.rs @@ -10,7 +10,7 @@ use bytes::Bytes; use futures::StreamExt; use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{AudioFormat, DocumentType}; use nvisy_core::io::ContentData; use nvisy_core::path::ContentSource; @@ -45,7 +45,7 @@ impl WavHandler { impl Handler for WavHandler { fn document_type(&self) -> DocumentType { - DocumentType::Wav + DocumentType::Audio(AudioFormat::Wav) } #[tracing::instrument(name = "wav.encode", skip_all, fields(output_bytes))] diff --git a/crates/nvisy-codec/src/handler/image/image_handler.rs b/crates/nvisy-codec/src/handler/image/image_handler.rs index 97fd54b1..b1089420 100644 --- a/crates/nvisy-codec/src/handler/image/image_handler.rs 
+++ b/crates/nvisy-codec/src/handler/image/image_handler.rs @@ -113,13 +113,19 @@ mod tests { #[test] fn png_variant_document_type() { let h = AnyImage::Png(make_png()); - assert_eq!(h.document_type(), DocumentType::Png); + assert_eq!( + h.document_type(), + DocumentType::Image(nvisy_core::fs::ImageFormat::Png), + ); } #[test] fn jpeg_variant_document_type() { let h = AnyImage::Jpeg(make_jpeg()); - assert_eq!(h.document_type(), DocumentType::Jpeg); + assert_eq!( + h.document_type(), + DocumentType::Image(nvisy_core::fs::ImageFormat::Jpeg), + ); } #[tokio::test] diff --git a/crates/nvisy-codec/src/handler/image/jpeg_handler.rs b/crates/nvisy-codec/src/handler/image/jpeg_handler.rs index aad853f5..07067513 100644 --- a/crates/nvisy-codec/src/handler/image/jpeg_handler.rs +++ b/crates/nvisy-codec/src/handler/image/jpeg_handler.rs @@ -22,7 +22,7 @@ pub struct JpegHandler { impl_image_handler!( JpegHandler, - nvisy_core::fs::DocumentType::Jpeg, + nvisy_core::fs::DocumentType::Image(nvisy_core::fs::ImageFormat::Jpeg), image::ImageFormat::Jpeg, "jpeg-handler", "jpeg.encode" diff --git a/crates/nvisy-codec/src/handler/image/png_handler.rs b/crates/nvisy-codec/src/handler/image/png_handler.rs index c086f85b..922e5aed 100644 --- a/crates/nvisy-codec/src/handler/image/png_handler.rs +++ b/crates/nvisy-codec/src/handler/image/png_handler.rs @@ -22,7 +22,7 @@ pub struct PngHandler { impl_image_handler!( PngHandler, - nvisy_core::fs::DocumentType::Png, + nvisy_core::fs::DocumentType::Image(nvisy_core::fs::ImageFormat::Png), image::ImageFormat::Png, "png-handler", "png.encode" diff --git a/crates/nvisy-codec/src/handler/rich/docx_handler.rs b/crates/nvisy-codec/src/handler/rich/docx_handler.rs index d3e1f45a..f317be7b 100644 --- a/crates/nvisy-codec/src/handler/rich/docx_handler.rs +++ b/crates/nvisy-codec/src/handler/rich/docx_handler.rs @@ -1,7 +1,7 @@ //! DOCX handler (stub: awaiting migration to full Loader/Handler pattern). 
use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{DocumentType, WordFormat}; use nvisy_core::io::ContentData; use crate::document::{SpanEditStream, SpanStream}; @@ -14,7 +14,7 @@ pub struct DocxHandler; impl Handler for DocxHandler { fn document_type(&self) -> DocumentType { - DocumentType::Docx + DocumentType::Word(WordFormat::Docx) } #[tracing::instrument(name = "docx.encode", skip_all)] diff --git a/crates/nvisy-codec/src/handler/text/csv_handler.rs b/crates/nvisy-codec/src/handler/text/csv_handler.rs index e13c3123..f1c5f308 100644 --- a/crates/nvisy-codec/src/handler/text/csv_handler.rs +++ b/crates/nvisy-codec/src/handler/text/csv_handler.rs @@ -17,7 +17,7 @@ use futures::StreamExt; use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{DocumentType, SpreadsheetFormat}; use nvisy_core::io::ContentData; use nvisy_core::path::ContentSource; @@ -84,7 +84,7 @@ pub struct CsvHandler { impl Handler for CsvHandler { fn document_type(&self) -> DocumentType { - DocumentType::Csv + DocumentType::Spreadsheet(SpreadsheetFormat::Csv) } #[tracing::instrument(name = "csv.encode", skip_all, fields(output_bytes))] diff --git a/crates/nvisy-codec/src/handler/text/csv_loader.rs b/crates/nvisy-codec/src/handler/text/csv_loader.rs index af520dbf..a3d62a44 100644 --- a/crates/nvisy-codec/src/handler/text/csv_loader.rs +++ b/crates/nvisy-codec/src/handler/text/csv_loader.rs @@ -137,7 +137,7 @@ mod tests { use bytes::Bytes; use futures::StreamExt; use nvisy_core::Error; - use nvisy_core::fs::DocumentType; + use nvisy_core::fs::{DocumentType, SpreadsheetFormat}; use nvisy_core::path::ContentSource; use super::*; @@ -151,7 +151,7 @@ mod tests { let content = content_from_str("name,age\nAlice,30\nBob,25\n"); let doc = CsvLoader.decode(&content, &CsvParams::default()).await?; - assert_eq!(doc.document_type(), DocumentType::Csv); + assert_eq!(doc.document_type(), DocumentType::Spreadsheet(SpreadsheetFormat::Csv)); assert_eq!( 
doc.headers(), Some(["name", "age"].map(String::from).as_slice()) diff --git a/crates/nvisy-codec/src/handler/text/json_handler.rs b/crates/nvisy-codec/src/handler/text/json_handler.rs index 31738f03..c76c069c 100644 --- a/crates/nvisy-codec/src/handler/text/json_handler.rs +++ b/crates/nvisy-codec/src/handler/text/json_handler.rs @@ -23,7 +23,7 @@ use std::num::NonZeroU32; use futures::StreamExt; use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{DocumentType, TextFormat}; use nvisy_core::io::ContentData; use nvisy_core::path::ContentSource; use serde::{Deserialize, Serialize}; @@ -127,7 +127,7 @@ pub struct JsonHandler { impl Handler for JsonHandler { fn document_type(&self) -> DocumentType { - DocumentType::Json + DocumentType::Text(TextFormat::Json) } #[tracing::instrument(name = "json.encode", skip_all, fields(output_bytes))] diff --git a/crates/nvisy-codec/src/handler/text/json_loader.rs b/crates/nvisy-codec/src/handler/text/json_loader.rs index a69de583..a0279bbd 100644 --- a/crates/nvisy-codec/src/handler/text/json_loader.rs +++ b/crates/nvisy-codec/src/handler/text/json_loader.rs @@ -94,7 +94,7 @@ fn detect_formatting(source: &str) -> (JsonIndent, bool) { mod tests { use bytes::Bytes; use nvisy_core::Error; - use nvisy_core::fs::DocumentType; + use nvisy_core::fs::{DocumentType, TextFormat}; use nvisy_core::path::ContentSource; use serde_json::json; @@ -109,7 +109,7 @@ mod tests { let content = content_from_str(r#"{"name": "Alice", "age": 30}"#); let doc = JsonLoader.decode(&content, &JsonParams::default()).await?; - assert_eq!(doc.document_type(), DocumentType::Json); + assert_eq!(doc.document_type(), DocumentType::Text(TextFormat::Json)); assert_eq!(doc.value(), &json!({"name": "Alice", "age": 30})); Ok(()) } diff --git a/crates/nvisy-codec/src/handler/text/text_handler.rs b/crates/nvisy-codec/src/handler/text/text_handler.rs index de1ac44a..621165e9 100644 --- a/crates/nvisy-codec/src/handler/text/text_handler.rs +++ 
b/crates/nvisy-codec/src/handler/text/text_handler.rs @@ -222,7 +222,10 @@ mod tests { #[test] fn txt_variant_document_type() { let h = AnyText::Txt(TxtHandler::new(vec!["hello".into()], false)); - assert_eq!(h.document_type(), DocumentType::Txt); + assert_eq!( + h.document_type(), + DocumentType::Text(nvisy_core::fs::TextFormat::Txt), + ); } #[tokio::test] diff --git a/crates/nvisy-codec/src/handler/text/txt_handler.rs b/crates/nvisy-codec/src/handler/text/txt_handler.rs index a0c09754..52615f03 100644 --- a/crates/nvisy-codec/src/handler/text/txt_handler.rs +++ b/crates/nvisy-codec/src/handler/text/txt_handler.rs @@ -16,7 +16,7 @@ use futures::StreamExt; use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{DocumentType, TextFormat}; use nvisy_core::io::ContentData; use nvisy_core::path::ContentSource; @@ -40,7 +40,7 @@ pub struct TxtHandler { impl Handler for TxtHandler { fn document_type(&self) -> DocumentType { - DocumentType::Txt + DocumentType::Text(TextFormat::Txt) } #[tracing::instrument(name = "txt.encode", skip_all, fields(output_bytes))] diff --git a/crates/nvisy-codec/src/handler/text/txt_loader.rs b/crates/nvisy-codec/src/handler/text/txt_loader.rs index b937517a..5840c977 100644 --- a/crates/nvisy-codec/src/handler/text/txt_loader.rs +++ b/crates/nvisy-codec/src/handler/text/txt_loader.rs @@ -53,7 +53,7 @@ mod tests { use bytes::Bytes; use futures::StreamExt; use nvisy_core::Error; - use nvisy_core::fs::DocumentType; + use nvisy_core::fs::{DocumentType, TextFormat}; use nvisy_core::path::ContentSource; use super::*; @@ -67,7 +67,7 @@ mod tests { let content = content_from_str("hello\nworld\n"); let doc = TxtLoader.decode(&content, &TxtParams::default()).await?; - assert_eq!(doc.document_type(), DocumentType::Txt); + assert_eq!(doc.document_type(), DocumentType::Text(TextFormat::Txt)); assert_eq!(doc.lines(), &["hello", "world"]); assert!(doc.trailing_newline()); Ok(()) diff --git 
a/crates/nvisy-codec/src/handler/text/xlsx_handler.rs b/crates/nvisy-codec/src/handler/text/xlsx_handler.rs index 9e1615d8..2bdcd202 100644 --- a/crates/nvisy-codec/src/handler/text/xlsx_handler.rs +++ b/crates/nvisy-codec/src/handler/text/xlsx_handler.rs @@ -1,7 +1,7 @@ //! XLSX handler (stub: awaiting full spreadsheet support). use nvisy_core::Error; -use nvisy_core::fs::DocumentType; +use nvisy_core::fs::{DocumentType, SpreadsheetFormat}; use nvisy_core::io::ContentData; use crate::document::{SpanEditStream, SpanStream}; @@ -13,7 +13,7 @@ pub struct XlsxHandler; impl Handler for XlsxHandler { fn document_type(&self) -> DocumentType { - DocumentType::Xlsx + DocumentType::Spreadsheet(SpreadsheetFormat::Xlsx) } #[tracing::instrument(name = "xlsx.encode", skip_all)] diff --git a/crates/nvisy-core/src/fs/content_kind.rs b/crates/nvisy-core/src/fs/content_kind.rs index ce35cf75..7f0515e3 100644 --- a/crates/nvisy-core/src/fs/content_kind.rs +++ b/crates/nvisy-core/src/fs/content_kind.rs @@ -38,8 +38,6 @@ pub enum ContentKind { Spreadsheet, /// Image files Image, - /// Archive files (ZIP, TAR, etc.) 
- Archive, /// Unknown or unsupported content type #[default] Unknown, @@ -69,12 +67,6 @@ impl ContentKind { pub fn is_image(&self) -> bool { matches!(self, Self::Image) } - - /// Check if this content kind represents an archive - #[must_use] - pub fn is_archive(&self) -> bool { - matches!(self, Self::Archive) - } } #[cfg(test)] @@ -94,9 +86,6 @@ mod tests { assert!(ContentKind::Image.is_image()); assert!(!ContentKind::Text.is_image()); - - assert!(ContentKind::Archive.is_archive()); - assert!(!ContentKind::Document.is_archive()); } #[test] @@ -105,7 +94,6 @@ mod tests { assert_eq!(ContentKind::Document.to_string(), "document"); assert_eq!(ContentKind::Spreadsheet.to_string(), "spreadsheet"); assert_eq!(ContentKind::Image.to_string(), "image"); - assert_eq!(ContentKind::Archive.to_string(), "archive"); assert_eq!(ContentKind::Unknown.to_string(), "unknown"); } diff --git a/crates/nvisy-core/src/fs/document_type.rs b/crates/nvisy-core/src/fs/document_type.rs index 1b7ce30c..8ef7c7ac 100644 --- a/crates/nvisy-core/src/fs/document_type.rs +++ b/crates/nvisy-core/src/fs/document_type.rs @@ -1,143 +1,288 @@ //! Document format classification. +use std::fmt; + use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use strum::{Display, EnumString}; +use strum::{Display, EnumString, IntoStaticStr}; -/// Document format that content can be classified as. -#[derive( - Debug, - Clone, - Copy, - PartialEq, - Eq, - Hash, - Display, - EnumString, - Serialize, - Deserialize, - JsonSchema -)] +/// Image file format. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] -pub enum DocumentType { - /// Plain text (`.txt`, `.log`, etc.). - Txt, - /// Comma-separated values. - Csv, - /// JSON data. - Json, - /// HTML pages. - Html, - /// PDF documents. - Pdf, - /// Microsoft Word (`.docx`). 
- Docx, - /// Microsoft Excel (`.xlsx`). - Xlsx, - /// PNG image. +pub enum ImageFormat { Png, - /// JPEG image. Jpeg, - /// WAV audio. + Webp, + Gif, + Tiff, +} + +impl ImageFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Png => "image/png", + Self::Jpeg => "image/jpeg", + Self::Webp => "image/webp", + Self::Gif => "image/gif", + Self::Tiff => "image/tiff", + } + } + + /// File extension (without leading dot). + pub fn extension(self) -> &'static str { + match self { + Self::Png => "png", + Self::Jpeg => "jpeg", + Self::Webp => "webp", + Self::Gif => "gif", + Self::Tiff => "tiff", + } + } + + /// Parse from a MIME type string. + pub fn from_mime(mime: &str) -> Option { + match mime { + "image/png" => Some(Self::Png), + "image/jpeg" => Some(Self::Jpeg), + "image/webp" => Some(Self::Webp), + "image/gif" => Some(Self::Gif), + "image/tiff" => Some(Self::Tiff), + _ => None, + } + } +} + +/// Word-processor document format. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum WordFormat { + Doc, + Docx, + Odt, +} + +impl WordFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Doc => "application/msword", + Self::Docx => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + Self::Odt => "application/vnd.oasis.opendocument.text", + } + } + + /// Parse from a MIME type string. + pub fn from_mime(mime: &str) -> Option { + match mime { + "application/msword" => Some(Self::Doc), + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => Some(Self::Docx), + "application/vnd.oasis.opendocument.text" => Some(Self::Odt), + _ => None, + } + } +} + +/// Presentation document format. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum PresentationFormat { + Ppt, + Pptx, + Odp, +} + +impl PresentationFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Ppt => "application/vnd.ms-powerpoint", + Self::Pptx => "application/vnd.openxmlformats-officedocument.presentationml.presentation", + Self::Odp => "application/vnd.oasis.opendocument.presentation", + } + } + + /// Parse from a MIME type string. + pub fn from_mime(mime: &str) -> Option { + match mime { + "application/vnd.ms-powerpoint" => Some(Self::Ppt), + "application/vnd.openxmlformats-officedocument.presentationml.presentation" => Some(Self::Pptx), + "application/vnd.oasis.opendocument.presentation" => Some(Self::Odp), + _ => None, + } + } +} + +/// Spreadsheet document format. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum SpreadsheetFormat { + Xls, + Xlsx, + Xlsm, + Xltx, + Csv, + Ods, +} + +impl SpreadsheetFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Xls => "application/vnd.ms-excel", + Self::Xlsx => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + Self::Xlsm => "application/vnd.ms-excel.sheet.macroEnabled.12", + Self::Xltx => "application/vnd.openxmlformats-officedocument.spreadsheetml.template", + Self::Csv => "text/csv", + Self::Ods => "application/vnd.oasis.opendocument.spreadsheet", + } + } + + /// Parse from a MIME type string. 
+ pub fn from_mime(mime: &str) -> Option { + match mime { + "application/vnd.ms-excel" => Some(Self::Xls), + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => Some(Self::Xlsx), + "application/vnd.ms-excel.sheet.macroEnabled.12" => Some(Self::Xlsm), + "application/vnd.openxmlformats-officedocument.spreadsheetml.template" => Some(Self::Xltx), + "text/csv" => Some(Self::Csv), + "application/vnd.oasis.opendocument.spreadsheet" => Some(Self::Ods), + _ => None, + } + } +} + +/// Audio file format. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum AudioFormat { Wav, - /// MP3 audio. Mp3, } -impl DocumentType { - /// Map a MIME type string to a [`DocumentType`]. - /// - /// Returns `None` for unrecognised MIME types. +impl AudioFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Wav => "audio/wav", + Self::Mp3 => "audio/mpeg", + } + } + + /// Parse from a MIME type string. + pub fn from_mime(mime: &str) -> Option { + match mime { + "audio/wav" | "audio/x-wav" => Some(Self::Wav), + "audio/mpeg" => Some(Self::Mp3), + _ => None, + } + } +} + +/// Plain text format. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, IntoStaticStr, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum TextFormat { + Txt, + Log, + Json, +} + +impl TextFormat { + /// MIME type string for this format. + pub fn mime_type(self) -> &'static str { + match self { + Self::Txt | Self::Log => "text/plain", + Self::Json => "application/json", + } + } + + /// Parse from a MIME type string. 
pub fn from_mime(mime: &str) -> Option { match mime { "text/plain" => Some(Self::Txt), - "text/csv" => Some(Self::Csv), "application/json" => Some(Self::Json), - "text/html" => Some(Self::Html), - "image/png" => Some(Self::Png), - "image/jpeg" => Some(Self::Jpeg), - "audio/x-wav" | "audio/wav" => Some(Self::Wav), - "audio/mpeg" => Some(Self::Mp3), - "application/pdf" => Some(Self::Pdf), - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => { - Some(Self::Docx) - } - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => Some(Self::Xlsx), _ => None, } } } +/// Document format that content can be classified as. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub enum DocumentType { + Text(TextFormat), + Image(ImageFormat), + Word(WordFormat), + Presentation(PresentationFormat), + Spreadsheet(SpreadsheetFormat), + Audio(AudioFormat), + Html, + Pdf, +} + +impl fmt::Display for DocumentType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Text(fmt_) => write!(f, "{fmt_}"), + Self::Image(fmt_) => write!(f, "{fmt_}"), + Self::Word(fmt_) => write!(f, "{fmt_}"), + Self::Presentation(fmt_) => write!(f, "{fmt_}"), + Self::Spreadsheet(fmt_) => write!(f, "{fmt_}"), + Self::Audio(fmt_) => write!(f, "{fmt_}"), + Self::Html => write!(f, "html"), + Self::Pdf => write!(f, "pdf"), + } + } +} + +impl DocumentType { + /// Map a MIME type string to a [`DocumentType`]. + /// + /// Returns `None` for unrecognised MIME types. 
+ pub fn from_mime(mime: &str) -> Option { + None.or_else(|| TextFormat::from_mime(mime).map(Self::Text)) + .or_else(|| ImageFormat::from_mime(mime).map(Self::Image)) + .or_else(|| WordFormat::from_mime(mime).map(Self::Word)) + .or_else(|| PresentationFormat::from_mime(mime).map(Self::Presentation)) + .or_else(|| SpreadsheetFormat::from_mime(mime).map(Self::Spreadsheet)) + .or_else(|| AudioFormat::from_mime(mime).map(Self::Audio)) + .or_else(|| match mime { + "text/html" => Some(Self::Html), + "application/pdf" => Some(Self::Pdf), + _ => None, + }) + } +} + #[cfg(test)] mod tests { use super::*; #[test] - fn test_from_mime_text_types() { - assert_eq!( - DocumentType::from_mime("text/plain"), - Some(DocumentType::Txt) - ); - assert_eq!(DocumentType::from_mime("text/csv"), Some(DocumentType::Csv)); - assert_eq!( - DocumentType::from_mime("text/html"), - Some(DocumentType::Html) - ); - } - - #[test] - fn test_from_mime_application_types() { - assert_eq!( - DocumentType::from_mime("application/json"), - Some(DocumentType::Json) - ); - assert_eq!( - DocumentType::from_mime("application/pdf"), - Some(DocumentType::Pdf) - ); - assert_eq!( - DocumentType::from_mime( - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - ), - Some(DocumentType::Docx) - ); - assert_eq!( - DocumentType::from_mime( - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - ), - Some(DocumentType::Xlsx) - ); + fn from_mime_unknown_returns_none() { + assert_eq!(DocumentType::from_mime("application/octet-stream"), None); + assert_eq!(DocumentType::from_mime("video/mp4"), None); + assert_eq!(DocumentType::from_mime(""), None); } #[test] - fn test_from_mime_media_types() { - assert_eq!( - DocumentType::from_mime("image/png"), - Some(DocumentType::Png) - ); - assert_eq!( - DocumentType::from_mime("image/jpeg"), - Some(DocumentType::Jpeg) - ); - assert_eq!( - DocumentType::from_mime("audio/wav"), - Some(DocumentType::Wav) - ); + fn from_mime_alias() { assert_eq!( 
DocumentType::from_mime("audio/x-wav"), - Some(DocumentType::Wav) - ); - assert_eq!( - DocumentType::from_mime("audio/mpeg"), - Some(DocumentType::Mp3) + Some(DocumentType::Audio(AudioFormat::Wav)), ); } - - #[test] - fn test_from_mime_unknown() { - assert_eq!(DocumentType::from_mime("application/octet-stream"), None); - assert_eq!(DocumentType::from_mime("video/mp4"), None); - assert_eq!(DocumentType::from_mime(""), None); - } } diff --git a/crates/nvisy-core/src/fs/mod.rs b/crates/nvisy-core/src/fs/mod.rs index 5085d438..05cddbb0 100644 --- a/crates/nvisy-core/src/fs/mod.rs +++ b/crates/nvisy-core/src/fs/mod.rs @@ -12,4 +12,7 @@ mod document_type; pub use content_kind::ContentKind; pub use content_metadata::ContentMetadata; -pub use document_type::DocumentType; +pub use document_type::{ + AudioFormat, DocumentType, ImageFormat, PresentationFormat, SpreadsheetFormat, TextFormat, + WordFormat, +}; diff --git a/crates/nvisy-ocr/Cargo.toml b/crates/nvisy-ocr/Cargo.toml index 44926bf8..a3b75994 100644 --- a/crates/nvisy-ocr/Cargo.toml +++ b/crates/nvisy-ocr/Cargo.toml @@ -43,9 +43,6 @@ bytes = { workspace = true, features = [] } sha2 = { workspace = true, optional = true, features = [] } hmac = { workspace = true, optional = true, features = [] } -# Derive macros and error handling -strum = { workspace = true, features = [] } - # (De)serialization serde = { workspace = true, features = [] } serde_json = { workspace = true, features = [] } diff --git a/crates/nvisy-ocr/src/backend/input.rs b/crates/nvisy-ocr/src/backend/input.rs index b794081c..723e21c1 100644 --- a/crates/nvisy-ocr/src/backend/input.rs +++ b/crates/nvisy-ocr/src/backend/input.rs @@ -3,38 +3,8 @@ use base64::Engine as _; use base64::engine::general_purpose::STANDARD as BASE64; use bytes::Bytes; +use nvisy_core::fs::ImageFormat; use nvisy_core::path::ContentSource; -use strum::{Display, EnumString, IntoStaticStr}; - -/// Image format passed to a [`Backend`]. 
-/// -/// [`Backend`]: super::Backend -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[derive(Display, EnumString, IntoStaticStr)] -pub enum ImageFormat { - #[strum(serialize = "png")] - Png, - #[strum(serialize = "jpeg")] - Jpeg, -} - -impl ImageFormat { - /// MIME type string for this format. - pub fn mime_type(self) -> &'static str { - match self { - Self::Png => "image/png", - Self::Jpeg => "image/jpeg", - } - } - - /// File extension for this format (without leading dot). - pub fn extension(self) -> &'static str { - match self { - Self::Png => "png", - Self::Jpeg => "jpeg", - } - } -} /// Image payload passed to [`Backend::run`]. /// diff --git a/crates/nvisy-ocr/src/backend/mod.rs b/crates/nvisy-ocr/src/backend/mod.rs index 936668ad..00fa9518 100644 --- a/crates/nvisy-ocr/src/backend/mod.rs +++ b/crates/nvisy-ocr/src/backend/mod.rs @@ -3,7 +3,8 @@ mod input; mod output; -pub use input::{ImageFormat, ImageInput}; +pub use input::ImageInput; +pub use nvisy_core::fs::ImageFormat; use nvisy_core::Error; pub use output::{Block, BlockKind, ImageOutput, Line, Page, Word}; use reqwest_middleware::reqwest::Response; From 0dd104004820205da86f892654460b731a508a38 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 9 Mar 2026 14:54:16 +0100 Subject: [PATCH 7/8] chore(deps): upgrade rig-core from 0.31 to 0.32 Remove turbofish type parameters from Client::builder() calls to match the new builder API where the HTTP client type is set via .http_client() rather than as a generic parameter. 
Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 63 ++++++++++++++----- Cargo.toml | 2 +- .../src/backend/provider/authenticated.rs | 6 +- .../src/backend/provider/unauthenticated.rs | 2 +- 4 files changed, 54 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cbbbb4d8..d2bc2dff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -769,20 +769,21 @@ checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" [[package]] name = "convert_case" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baaaa0ecca5b51987b9423ccdc971514dd8b0bb7b4060b983d3664dad3f1f89f" +checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9" dependencies = [ "unicode-segmentation", ] [[package]] -name = "convert_case" -version = "0.10.0" +name = "core-foundation" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ - "unicode-segmentation", + "core-foundation-sys", + "libc", ] [[package]] @@ -1125,7 +1126,7 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb" dependencies = [ - "convert_case 0.10.0", + "convert_case", "proc-macro2", "quote", "rustc_version", @@ -1902,9 +1903,11 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2", + "system-configuration", "tokio", "tower-service", "tracing", + "windows-registry", ] [[package]] @@ -3979,9 +3982,9 @@ checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce" [[package]] name = "rig-core" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "437fa2a15825caf2505411bbe55b05c8eb122e03934938b38f9ecaa1d6ded7c8" +checksum = 
"24eb001344690ad016a095c6384b09b93ea12551490b4ed1a197058aeac990d6" dependencies = [ "as-any", "async-stream", @@ -4013,11 +4016,11 @@ dependencies = [ [[package]] name = "rig-derive" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f4b48f1449fa214d5cb11d0d0d952fd4c13b7ca5d1eaac64c87ce03cfb9e24" +checksum = "3b6d9818c9cb13d00664b52fd3e47b0554bc2d5c59cfb90340dd9411b09553bc" dependencies = [ - "convert_case 0.8.0", + "convert_case", "deluxe", "indoc", "proc-macro2", @@ -4133,7 +4136,7 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784" dependencies = [ - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "jni", "log", @@ -4279,7 +4282,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d17b898a6d6948c3a8ee4372c17cb384f90d2e6e912ef00895b14fd7ab54ec38" dependencies = [ "bitflags", - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -4709,6 +4712,27 @@ dependencies = [ "syn", ] +[[package]] +name = "system-configuration" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "target-lexicon" version = "0.13.5" @@ -5593,6 +5617,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] 
+name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.4.1" diff --git a/Cargo.toml b/Cargo.toml index 62b89ab7..c6cba8b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,7 @@ nvisy-rig = { path = "./crates/nvisy-rig", version = "0.1.0" } nvisy-server = { path = "./crates/nvisy-server", version = "0.1.0" } # LLM framework -rig-core = { version = "0.31", features = [] } +rig-core = { version = "0.32", features = [] } # HTTP client and middleware reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls", "multipart"] } diff --git a/crates/nvisy-rig/src/backend/provider/authenticated.rs b/crates/nvisy-rig/src/backend/provider/authenticated.rs index 73dcb9c0..67d06aab 100644 --- a/crates/nvisy-rig/src/backend/provider/authenticated.rs +++ b/crates/nvisy-rig/src/backend/provider/authenticated.rs @@ -68,7 +68,7 @@ impl AuthenticatedProvider { &self, http: ClientWithMiddleware, ) -> Result, Error> { - let mut b = openai::Client::::builder() + let mut b = openai::Client::builder() .api_key(&self.api_key) .http_client(http); if let Some(url) = &self.base_url { @@ -84,7 +84,7 @@ impl AuthenticatedProvider { &self, http: ClientWithMiddleware, ) -> Result, Error> { - let mut b = gemini::Client::::builder() + let mut b = gemini::Client::builder() .api_key(&self.api_key) .http_client(http); if let Some(url) = &self.base_url { @@ -100,7 +100,7 @@ impl AuthenticatedProvider { &self, http: ClientWithMiddleware, ) -> Result, Error> { - let mut b = anthropic::Client::::builder() + let mut b = anthropic::Client::builder() .api_key(&self.api_key) .http_client(http); if let Some(url) = &self.base_url { diff --git a/crates/nvisy-rig/src/backend/provider/unauthenticated.rs 
b/crates/nvisy-rig/src/backend/provider/unauthenticated.rs index 13a49ee7..580ea5a2 100644 --- a/crates/nvisy-rig/src/backend/provider/unauthenticated.rs +++ b/crates/nvisy-rig/src/backend/provider/unauthenticated.rs @@ -19,7 +19,7 @@ impl UnauthenticatedProvider { &self, http: ClientWithMiddleware, ) -> Result, Error> { - let mut b = ollama::Client::::builder() + let mut b = ollama::Client::builder() .api_key(rig::client::Nothing) .http_client(http); if let Some(url) = &self.base_url { From 1ff233e088fe08b78dc2df4280a524ea224aeec1 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 9 Mar 2026 15:47:02 +0100 Subject: [PATCH 8/8] fix(core): use or instead of or_else for non-lazy match in from_mime Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-core/src/fs/document_type.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/nvisy-core/src/fs/document_type.rs b/crates/nvisy-core/src/fs/document_type.rs index 8ef7c7ac..f8bd7f5b 100644 --- a/crates/nvisy-core/src/fs/document_type.rs +++ b/crates/nvisy-core/src/fs/document_type.rs @@ -259,7 +259,7 @@ impl DocumentType { .or_else(|| PresentationFormat::from_mime(mime).map(Self::Presentation)) .or_else(|| SpreadsheetFormat::from_mime(mime).map(Self::Spreadsheet)) .or_else(|| AudioFormat::from_mime(mime).map(Self::Audio)) - .or_else(|| match mime { + .or(match mime { "text/html" => Some(Self::Html), "application/pdf" => Some(Self::Pdf), _ => None,