From 0dea4c29578f8eb6672120a6c75594269e1f449d Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Wed, 11 Mar 2026 12:58:52 +0100 Subject: [PATCH 01/10] feat(engine): add ApplyPatch trait and typed envelope patch system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move DocumentEnvelope from context/ to its own envelope/ module and introduce ApplyPatch — a trait for merging operation results back into the envelope. Concrete patch types: DetectedEntities, RefinedEntities, PolicyOutcome, OperationEntry. Supports tuples for composite returns. Co-Authored-By: Claude Opus 4.6 --- .../nvisy-engine/src/operation/context/mod.rs | 2 - .../src/operation/envelope/apply.rs | 43 +++++++++++++++++++ .../src/operation/envelope/audit.rs | 21 +++++++++ .../src/operation/envelope/detection.rs | 30 +++++++++++++ .../{context/envelope.rs => envelope/mod.rs} | 24 +++++++++-- .../src/operation/envelope/policy.rs | 21 +++++++++ .../src/operation/lifecycle/encryption.rs | 2 +- crates/nvisy-engine/src/operation/mod.rs | 6 +-- 8 files changed, 140 insertions(+), 9 deletions(-) create mode 100644 crates/nvisy-engine/src/operation/envelope/apply.rs create mode 100644 crates/nvisy-engine/src/operation/envelope/audit.rs create mode 100644 crates/nvisy-engine/src/operation/envelope/detection.rs rename crates/nvisy-engine/src/operation/{context/envelope.rs => envelope/mod.rs} (81%) create mode 100644 crates/nvisy-engine/src/operation/envelope/policy.rs diff --git a/crates/nvisy-engine/src/operation/context/mod.rs b/crates/nvisy-engine/src/operation/context/mod.rs index ccbfc2f..1b99b41 100644 --- a/crates/nvisy-engine/src/operation/context/mod.rs +++ b/crates/nvisy-engine/src/operation/context/mod.rs @@ -19,12 +19,10 @@ //! [`Operation::Input`]: crate::operation::Operation::Input //! 
[`Operation::Output`]: crate::operation::Operation::Output -mod envelope; mod parallel; mod sequential; mod shared; -pub use envelope::DocumentEnvelope; pub use parallel::ParallelContext; pub use sequential::SequentialContext; pub use shared::SharedContext; diff --git a/crates/nvisy-engine/src/operation/envelope/apply.rs b/crates/nvisy-engine/src/operation/envelope/apply.rs new file mode 100644 index 0000000..3e05a6c --- /dev/null +++ b/crates/nvisy-engine/src/operation/envelope/apply.rs @@ -0,0 +1,43 @@ +//! The [`ApplyPatch`] trait and blanket implementations. + +use super::DocumentEnvelope; + +/// A value that can be applied to a [`DocumentEnvelope`], merging +/// operation results into the shared pipeline state. +/// +/// Each operation returns a concrete patch type; the orchestrator +/// calls [`apply`](ApplyPatch::apply) to fold it into the envelope +/// without needing to know the operation's internals. +pub trait ApplyPatch { + /// Merge this patch into the envelope. + fn apply(self, envelope: &mut DocumentEnvelope); +} + +/// A no-op patch for operations that don't modify the envelope. +impl ApplyPatch for () { + fn apply(self, _envelope: &mut DocumentEnvelope) {} +} + +/// Apply multiple patches of the same type in sequence. +impl ApplyPatch for Vec

{ + fn apply(self, envelope: &mut DocumentEnvelope) { + for patch in self { + patch.apply(envelope); + } + } +} + +impl ApplyPatch for (A, B) { + fn apply(self, envelope: &mut DocumentEnvelope) { + self.0.apply(envelope); + self.1.apply(envelope); + } +} + +impl ApplyPatch for (A, B, C) { + fn apply(self, envelope: &mut DocumentEnvelope) { + self.0.apply(envelope); + self.1.apply(envelope); + self.2.apply(envelope); + } +} diff --git a/crates/nvisy-engine/src/operation/envelope/audit.rs b/crates/nvisy-engine/src/operation/envelope/audit.rs new file mode 100644 index 0000000..3cf5481 --- /dev/null +++ b/crates/nvisy-engine/src/operation/envelope/audit.rs @@ -0,0 +1,21 @@ +//! Audit entry patches. + +use crate::provenance::AuditEntry; + +use super::apply::ApplyPatch; +use super::DocumentEnvelope; + +/// A single audit log entry recording what an operation did. +pub struct OperationEntry(pub AuditEntry); + +impl ApplyPatch for OperationEntry { + fn apply(self, envelope: &mut DocumentEnvelope) { + envelope.audit.push_entry(self.0); + } +} + +impl ApplyPatch for AuditEntry { + fn apply(self, envelope: &mut DocumentEnvelope) { + envelope.audit.push_entry(self); + } +} diff --git a/crates/nvisy-engine/src/operation/envelope/detection.rs b/crates/nvisy-engine/src/operation/envelope/detection.rs new file mode 100644 index 0000000..53bb769 --- /dev/null +++ b/crates/nvisy-engine/src/operation/envelope/detection.rs @@ -0,0 +1,30 @@ +//! Entity detection patches. + +use nvisy_ontology::entity::Entities; + +use super::apply::ApplyPatch; +use super::DocumentEnvelope; + +/// New entities discovered by a detection operation (NER, OCR, CV, +/// pattern match, manual annotation). +/// +/// Appended to the envelope's existing entity set. 
+pub struct DetectedEntities(pub Entities); + +impl ApplyPatch for DetectedEntities { + fn apply(self, envelope: &mut DocumentEnvelope) { + envelope.entities.extend(self.0); + } +} + +/// A fully recomputed entity set produced by refinement operations +/// (deduplication, ensemble fusion). +/// +/// Replaces the envelope's entity set entirely. +pub struct RefinedEntities(pub Entities); + +impl ApplyPatch for RefinedEntities { + fn apply(self, envelope: &mut DocumentEnvelope) { + envelope.entities = self.0; + } +} diff --git a/crates/nvisy-engine/src/operation/context/envelope.rs b/crates/nvisy-engine/src/operation/envelope/mod.rs similarity index 81% rename from crates/nvisy-engine/src/operation/context/envelope.rs rename to crates/nvisy-engine/src/operation/envelope/mod.rs index 0624a16..42cdc5d 100644 --- a/crates/nvisy-engine/src/operation/context/envelope.rs +++ b/crates/nvisy-engine/src/operation/envelope/mod.rs @@ -9,15 +9,28 @@ //! ContentData //! ↓ Import //! DocumentEnvelope { document, … } -//! ↓ OCR / NER / CV / PatternMatch +//! ↓ OCR / NER / CV / PatternMatch → DetectedEntities //! DocumentEnvelope { document, entities, … } -//! ↓ Deduplication / Ensemble +//! ↓ Deduplication / Ensemble → RefinedEntities //! DocumentEnvelope { document, entities (merged), … } -//! ↓ PolicyEvaluation +//! ↓ PolicyEvaluation → PolicyOutcome //! DocumentEnvelope { document, entities, audit { decisions, records }, … } //! ↓ Redaction //! DocumentEnvelope { document (redacted), entities, audit { … } } //! ``` +//! +//! Operations produce typed patch values that implement [`ApplyPatch`]. +//! The orchestrator merges each patch via [`DocumentEnvelope::apply`]. 
+ +mod apply; +mod audit; +mod detection; +mod policy; + +pub use apply::ApplyPatch; +pub use audit::OperationEntry; +pub use detection::{DetectedEntities, RefinedEntities}; +pub use policy::PolicyOutcome; use nvisy_codec::Document; use nvisy_ontology::entity::Entities; @@ -69,6 +82,11 @@ impl DocumentEnvelope { pub fn entity_count(&self) -> usize { self.entities.len() } + + /// Merge an operation's output into this envelope. + pub fn apply(&mut self, patch: impl ApplyPatch) { + patch.apply(self); + } } impl std::fmt::Debug for DocumentEnvelope { diff --git a/crates/nvisy-engine/src/operation/envelope/policy.rs b/crates/nvisy-engine/src/operation/envelope/policy.rs new file mode 100644 index 0000000..dffced3 --- /dev/null +++ b/crates/nvisy-engine/src/operation/envelope/policy.rs @@ -0,0 +1,21 @@ +//! Policy evaluation patches. + +use crate::provenance::{RedactionDecision, RedactionRecord}; + +use super::apply::ApplyPatch; +use super::DocumentEnvelope; + +/// Redaction decisions and audit records produced by policy evaluation. +pub struct PolicyOutcome { + /// How each entity should be redacted. + pub decisions: Vec, + /// Audit-facing records of what was decided. 
+ pub records: Vec, +} + +impl ApplyPatch for PolicyOutcome { + fn apply(self, envelope: &mut DocumentEnvelope) { + envelope.audit.decisions.extend(self.decisions); + envelope.audit.records.extend(self.records); + } +} diff --git a/crates/nvisy-engine/src/operation/lifecycle/encryption.rs b/crates/nvisy-engine/src/operation/lifecycle/encryption.rs index 4fcb843..35cd010 100644 --- a/crates/nvisy-engine/src/operation/lifecycle/encryption.rs +++ b/crates/nvisy-engine/src/operation/lifecycle/encryption.rs @@ -7,7 +7,7 @@ use aes_gcm::{Aes256Gcm, KeyInit, Nonce}; use nvisy_core::{Error, ErrorKind, Result}; use rand::RngExt; -use crate::operation::context::DocumentEnvelope; +use crate::operation::envelope::DocumentEnvelope; use crate::operation::utility::crypto::{ EncryptedContent, EncryptionAlgorithm, KeyProvider, NONCE_SIZE, WireEnvelope, }; diff --git a/crates/nvisy-engine/src/operation/mod.rs b/crates/nvisy-engine/src/operation/mod.rs index 1e8e20a..caa642c 100644 --- a/crates/nvisy-engine/src/operation/mod.rs +++ b/crates/nvisy-engine/src/operation/mod.rs @@ -15,6 +15,7 @@ //! | Lifecycle | [`lifecycle`] | Content I/O (import, export, encrypt) | mod context; +pub mod envelope; pub mod inference; pub mod lifecycle; pub mod processing; @@ -22,9 +23,8 @@ pub mod utility; use std::future::Future; -pub use context::{ - DocumentEnvelope, OperationContext, ParallelContext, SequentialContext, SharedContext, -}; +pub use context::{OperationContext, ParallelContext, SequentialContext, SharedContext}; +pub use envelope::DocumentEnvelope; use nvisy_core::Result; /// A single unit of work in the redaction pipeline. 
From d175d9a252da6cc3c29b21c5e29e732777b8354f Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Wed, 11 Mar 2026 13:09:08 +0100 Subject: [PATCH 02/10] refactor(engine): update operations to return typed patch types Detection operations (NER, CV, OcrVerification, PatternMatch) now return DetectedEntities; refinement operations (Deduplication, Ensemble) return RefinedEntities; EvaluatePolicy returns PolicyOutcome instead of the removed EvaluatePolicyOutput. Co-Authored-By: Claude Opus 4.6 --- .../nvisy-engine/src/operation/envelope/audit.rs | 5 ++--- .../src/operation/envelope/detection.rs | 2 +- crates/nvisy-engine/src/operation/envelope/mod.rs | 3 +-- .../nvisy-engine/src/operation/envelope/policy.rs | 5 ++--- .../src/operation/inference/computer_vision.rs | 9 +++++---- .../nvisy-engine/src/operation/inference/ner.rs | 9 +++++---- .../src/operation/inference/ocr_verification.rs | 11 ++++++----- .../src/operation/processing/deduplication.rs | 7 ++++--- .../src/operation/processing/ensemble_fusion.rs | 7 ++++--- .../src/operation/processing/pattern_match.rs | 13 +++++++------ .../src/operation/processing/policy_evaluation.rs | 15 ++++----------- 11 files changed, 41 insertions(+), 45 deletions(-) diff --git a/crates/nvisy-engine/src/operation/envelope/audit.rs b/crates/nvisy-engine/src/operation/envelope/audit.rs index 3cf5481..2b5110b 100644 --- a/crates/nvisy-engine/src/operation/envelope/audit.rs +++ b/crates/nvisy-engine/src/operation/envelope/audit.rs @@ -1,9 +1,8 @@ //! Audit entry patches. -use crate::provenance::AuditEntry; - -use super::apply::ApplyPatch; use super::DocumentEnvelope; +use super::apply::ApplyPatch; +use crate::provenance::AuditEntry; /// A single audit log entry recording what an operation did. 
pub struct OperationEntry(pub AuditEntry); diff --git a/crates/nvisy-engine/src/operation/envelope/detection.rs b/crates/nvisy-engine/src/operation/envelope/detection.rs index 53bb769..144969f 100644 --- a/crates/nvisy-engine/src/operation/envelope/detection.rs +++ b/crates/nvisy-engine/src/operation/envelope/detection.rs @@ -2,8 +2,8 @@ use nvisy_ontology::entity::Entities; -use super::apply::ApplyPatch; use super::DocumentEnvelope; +use super::apply::ApplyPatch; /// New entities discovered by a detection operation (NER, OCR, CV, /// pattern match, manual annotation). diff --git a/crates/nvisy-engine/src/operation/envelope/mod.rs b/crates/nvisy-engine/src/operation/envelope/mod.rs index 42cdc5d..d19e33f 100644 --- a/crates/nvisy-engine/src/operation/envelope/mod.rs +++ b/crates/nvisy-engine/src/operation/envelope/mod.rs @@ -30,10 +30,9 @@ mod policy; pub use apply::ApplyPatch; pub use audit::OperationEntry; pub use detection::{DetectedEntities, RefinedEntities}; -pub use policy::PolicyOutcome; - use nvisy_codec::Document; use nvisy_ontology::entity::Entities; +pub use policy::PolicyOutcome; use crate::provenance::Audit; diff --git a/crates/nvisy-engine/src/operation/envelope/policy.rs b/crates/nvisy-engine/src/operation/envelope/policy.rs index dffced3..078dac4 100644 --- a/crates/nvisy-engine/src/operation/envelope/policy.rs +++ b/crates/nvisy-engine/src/operation/envelope/policy.rs @@ -1,9 +1,8 @@ //! Policy evaluation patches. -use crate::provenance::{RedactionDecision, RedactionRecord}; - -use super::apply::ApplyPatch; use super::DocumentEnvelope; +use super::apply::ApplyPatch; +use crate::provenance::{RedactionDecision, RedactionRecord}; /// Redaction decisions and audit records produced by policy evaluation. 
pub struct PolicyOutcome { diff --git a/crates/nvisy-engine/src/operation/inference/computer_vision.rs b/crates/nvisy-engine/src/operation/inference/computer_vision.rs index 1650fb9..5e60a0e 100644 --- a/crates/nvisy-engine/src/operation/inference/computer_vision.rs +++ b/crates/nvisy-engine/src/operation/inference/computer_vision.rs @@ -7,9 +7,10 @@ use nvisy_codec::Span; use nvisy_codec::handler::ImageData; use nvisy_core::math::BoundingBox; use nvisy_core::{Error, Result}; -use nvisy_ontology::entity::{DetectionMethod, Entities, Entity, ImageLocation}; +use nvisy_ontology::entity::{DetectionMethod, Entity, ImageLocation}; use nvisy_rig::agent::{CvAgent, CvEntity, DetectionConfig}; +use crate::operation::envelope::DetectedEntities; use crate::operation::{Operation, ParallelContext}; const TARGET: &str = "nvisy_engine::op::computer_vision"; @@ -26,7 +27,7 @@ impl ComputerVision { Self { agent, config } } - async fn detect(&self, spans: Vec>) -> Result { + async fn detect(&self, spans: Vec>) -> Result { tracing::debug!(target: TARGET, span_count = spans.len(), "detecting entities"); let mut entities = Vec::new(); @@ -45,13 +46,13 @@ impl ComputerVision { } } - Ok(entities.into()) + Ok(DetectedEntities(entities.into())) } } impl Operation for ComputerVision { type Input = ParallelContext>>; - type Output = ParallelContext; + type Output = ParallelContext; async fn call(&self, input: Self::Input) -> Result { input.parallel_map(|spans| self.detect(spans)).await diff --git a/crates/nvisy-engine/src/operation/inference/ner.rs b/crates/nvisy-engine/src/operation/inference/ner.rs index 5db4880..4b38aee 100644 --- a/crates/nvisy-engine/src/operation/inference/ner.rs +++ b/crates/nvisy-engine/src/operation/inference/ner.rs @@ -9,7 +9,7 @@ use nvisy_codec::handler::TxtSpan; use nvisy_core::{Error, Result}; use nvisy_http::HttpClient; use nvisy_ontology::entity::{ - DetectionMethod, Entities, Entity, EntityCategory, EntityKind, TextLocation, + DetectionMethod, Entity, 
EntityCategory, EntityKind, TextLocation, }; use nvisy_rig::agent::{ AgentConfig, AgentProvider, DetectionConfig, KnownNerEntity, NerAgent, NerContext, @@ -17,6 +17,7 @@ use nvisy_rig::agent::{ use serde::Deserialize; use tokio::sync::Mutex; +use crate::operation::envelope::DetectedEntities; use crate::operation::{Operation, SequentialContext}; const TARGET: &str = "nvisy_engine::op::ner"; @@ -100,7 +101,7 @@ impl Ner { state.known_entities.clear(); } - async fn detect(&self, spans: Vec>) -> Result { + async fn detect(&self, spans: Vec>) -> Result { tracing::debug!(target: TARGET, span_count = spans.len(), "running NER"); let mut entities = Vec::new(); @@ -169,13 +170,13 @@ impl Ner { state.known_entities = merge_ctx.known_entities; } - Ok(entities.into()) + Ok(DetectedEntities(entities.into())) } } impl Operation for Ner { type Input = SequentialContext>>; - type Output = SequentialContext; + type Output = SequentialContext; async fn call(&self, input: Self::Input) -> Result { input.sequential_map(|spans| self.detect(spans)).await diff --git a/crates/nvisy-engine/src/operation/inference/ocr_verification.rs b/crates/nvisy-engine/src/operation/inference/ocr_verification.rs index d71df01..d5029cc 100644 --- a/crates/nvisy-engine/src/operation/inference/ocr_verification.rs +++ b/crates/nvisy-engine/src/operation/inference/ocr_verification.rs @@ -12,6 +12,7 @@ use nvisy_core::{Error, Result}; use nvisy_ontology::entity::Entities; use nvisy_rig::agent::OcrAgent; +use crate::operation::envelope::DetectedEntities; use crate::operation::{Operation, ParallelContext}; const TARGET: &str = "nvisy_engine::op::ocr_verification"; @@ -38,16 +39,16 @@ impl OcrVerification { Self { agent } } - async fn verify(&self, data: OcrVerificationInput) -> Result { + async fn verify(&self, data: OcrVerificationInput) -> Result { if data.entities.is_empty() { tracing::debug!(target: TARGET, "no entities to verify"); - return Ok(Entities::new()); + return Ok(DetectedEntities(Entities::new())); 
} tracing::debug!(target: TARGET, entity_count = data.entities.len(), "verifying entities"); let image_bytes = match data.image_spans.first() { Some(span) => span.data.encode_png()?, - None => return Ok(data.entities), + None => return Ok(DetectedEntities(data.entities)), }; let entities = self @@ -56,13 +57,13 @@ impl OcrVerification { .await .map_err(|e| Error::runtime(e.to_string(), "ocr-verification", e.is_retryable()))?; - Ok(entities.into()) + Ok(DetectedEntities(entities.into())) } } impl Operation for OcrVerification { type Input = ParallelContext; - type Output = ParallelContext; + type Output = ParallelContext; async fn call(&self, input: Self::Input) -> Result { input.parallel_map(|data| self.verify(data)).await diff --git a/crates/nvisy-engine/src/operation/processing/deduplication.rs b/crates/nvisy-engine/src/operation/processing/deduplication.rs index 6a8a37d..c4b1d5e 100644 --- a/crates/nvisy-engine/src/operation/processing/deduplication.rs +++ b/crates/nvisy-engine/src/operation/processing/deduplication.rs @@ -7,6 +7,7 @@ use nvisy_core::Result; use nvisy_ontology::entity::{DetectionMethod, Entities, Entity, Location}; +use crate::operation::envelope::RefinedEntities; use crate::operation::{Operation, ParallelContext}; const TARGET: &str = "nvisy_engine::op::deduplication"; @@ -23,11 +24,11 @@ const TARGET: &str = "nvisy_engine::op::deduplication"; pub struct Deduplication; impl Deduplication { - async fn deduplicate(&self, entities: Entities) -> Result { + async fn deduplicate(&self, entities: Entities) -> Result { let before = entities.len(); let result = Self::execute(entities); tracing::debug!(target: TARGET, before, after = result.len(), "deduplicated entities"); - Ok(result) + Ok(RefinedEntities(result)) } /// Deduplicate and merge overlapping entities. 
@@ -66,7 +67,7 @@ impl Deduplication { impl Operation for Deduplication { type Input = ParallelContext; - type Output = ParallelContext; + type Output = ParallelContext; async fn call(&self, input: Self::Input) -> Result { input.parallel_map(|data| self.deduplicate(data)).await diff --git a/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs b/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs index 75155a5..a9cb7e2 100644 --- a/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs +++ b/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs @@ -6,6 +6,7 @@ use std::collections::HashMap; use nvisy_core::Result; use nvisy_ontology::entity::{DetectionMethod, Entities, Entity, Location}; +use crate::operation::envelope::RefinedEntities; use crate::operation::{Operation, ParallelContext}; const TARGET: &str = "nvisy_engine::op::ensemble"; @@ -35,11 +36,11 @@ impl Ensemble { Self { strategy } } - async fn fuse(&self, entities: Entities) -> Result { + async fn fuse(&self, entities: Entities) -> Result { let before = entities.len(); let result = self.merge(entities); tracing::debug!(target: TARGET, before, after = result.len(), "fused entities"); - Ok(result) + Ok(RefinedEntities(result)) } /// Group entities by `(kind, value, overlapping location)` then fuse @@ -114,7 +115,7 @@ impl Ensemble { impl Operation for Ensemble { type Input = ParallelContext; - type Output = ParallelContext; + type Output = ParallelContext; async fn call(&self, input: Self::Input) -> Result { input.parallel_map(|data| self.fuse(data)).await diff --git a/crates/nvisy-engine/src/operation/processing/pattern_match.rs b/crates/nvisy-engine/src/operation/processing/pattern_match.rs index 3174432..216f94e 100644 --- a/crates/nvisy-engine/src/operation/processing/pattern_match.rs +++ b/crates/nvisy-engine/src/operation/processing/pattern_match.rs @@ -14,6 +14,7 @@ use nvisy_pattern::{ use serde::Deserialize; use serde_json::Value; +use 
crate::operation::envelope::DetectedEntities; use crate::operation::{Operation, ParallelContext}; const TARGET: &str = "nvisy_engine::op::pattern_match"; @@ -61,20 +62,20 @@ impl PatternMatch { } impl PatternMatch { - async fn scan(&self, data: PatternInput) -> Result { + async fn scan(&self, data: PatternInput) -> Result { tracing::debug!(target: TARGET, "scanning for patterns"); match data { - PatternInput::Text(spans) => self.detect_text(spans), - PatternInput::Csv(spans) => self.detect_csv(spans), - PatternInput::Html(spans) => self.detect_html(spans), - PatternInput::Json(spans) => self.detect_json(spans), + PatternInput::Text(spans) => self.detect_text(spans).map(DetectedEntities), + PatternInput::Csv(spans) => self.detect_csv(spans).map(DetectedEntities), + PatternInput::Html(spans) => self.detect_html(spans).map(DetectedEntities), + PatternInput::Json(spans) => self.detect_json(spans).map(DetectedEntities), } } } impl Operation for PatternMatch { type Input = ParallelContext; - type Output = ParallelContext; + type Output = ParallelContext; async fn call(&self, input: Self::Input) -> Result { input.parallel_map(|data| self.scan(data)).await diff --git a/crates/nvisy-engine/src/operation/processing/policy_evaluation.rs b/crates/nvisy-engine/src/operation/processing/policy_evaluation.rs index 519ae36..9e93793 100644 --- a/crates/nvisy-engine/src/operation/processing/policy_evaluation.rs +++ b/crates/nvisy-engine/src/operation/processing/policy_evaluation.rs @@ -13,6 +13,7 @@ use nvisy_ontology::entity::{Entities, Entity}; use nvisy_ontology::policy::{PolicyRule, RuleAction, Strategy, TextStrategy}; use serde::Deserialize; +use crate::operation::envelope::PolicyOutcome; use crate::operation::{Operation, ParallelContext}; use crate::provenance::{RedactionDecision, RedactionRecord}; @@ -40,14 +41,6 @@ fn default_threshold() -> f64 { 0.5 } -/// Output of policy evaluation: both pipeline decisions and audit records. 
-pub struct EvaluatePolicyOutput { - /// Pipeline-facing redaction decisions. - pub decisions: Vec, - /// Audit-facing redaction records. - pub records: Vec, -} - /// Evaluates policy rules against detected entities and produces /// [`RedactionDecision`] and [`RedactionRecord`] pairs. /// @@ -64,7 +57,7 @@ impl EvaluatePolicy { Ok(Self { params }) } - pub async fn execute(&self, entities: Entities) -> Result { + pub async fn execute(&self, entities: Entities) -> Result { tracing::debug!(target: TARGET, entity_count = entities.len(), "evaluating policies"); let default_spec = &self.params.default_spec; let default_threshold = self.params.default_confidence_threshold; @@ -116,13 +109,13 @@ impl EvaluatePolicy { records.push(record); } - Ok(EvaluatePolicyOutput { decisions, records }) + Ok(PolicyOutcome { decisions, records }) } } impl Operation for EvaluatePolicy { type Input = ParallelContext; - type Output = ParallelContext; + type Output = ParallelContext; async fn call(&self, input: Self::Input) -> Result { input.parallel_map(|data| self.execute(data)).await From 9dc92b420204cbbcccfd15a7ef071c5a409f4a90 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Thu, 12 Mar 2026 18:21:31 +0100 Subject: [PATCH 03/10] refactor(ontology): expand entity taxonomy with new kinds, categories, and methods Add new EntityKind variants (VehicleId, LicensePlate, CardExpiry, BankRouting, SwiftCode, CryptoAddress, Amount, InsuranceId, PrescriptionId, Medication, FacialGeometry, Voiceprint, RetinaScan, AuthToken, PrivateKey, DeviceId, GeolocationMetadata, Handwriting, Logo, Barcode, DepartmentName, FacilityName, CaseNumber, InternalId, DateTime). Add EntityCategory::Visual. Expand ExtractionMethod and RecognitionMethod with new variants. Update sensitivity and category mappings, pattern assets, and all downstream consumers. 
Co-Authored-By: Claude Opus 4.6 --- .../operation/inference/computer_vision.rs | 13 +- .../src/operation/inference/ner.rs | 8 +- .../src/operation/processing/deduplication.rs | 71 ++-- .../operation/processing/ensemble_fusion.rs | 67 ++-- .../operation/processing/manual_detection.rs | 16 +- .../src/operation/processing/pattern_match.rs | 264 +++------------ crates/nvisy-engine/src/pipeline/ontology.rs | 8 +- crates/nvisy-engine/src/provenance/kind.rs | 22 +- crates/nvisy-ontology/src/entity/category.rs | 62 ++-- crates/nvisy-ontology/src/entity/kind.rs | 303 ++++++++++++------ crates/nvisy-ontology/src/entity/method.rs | 153 +++++++-- crates/nvisy-ontology/src/entity/mod.rs | 69 +++- crates/nvisy-pattern/README.md | 9 +- .../assets/patterns/date_of_birth.json | 2 +- .../nvisy-pattern/assets/patterns/email.json | 2 +- .../nvisy-pattern/assets/patterns/ipv4.json | 2 +- .../nvisy-pattern/assets/patterns/ipv6.json | 2 +- .../assets/patterns/languages.json | 4 +- .../assets/patterns/mac_address.json | 2 +- .../assets/patterns/nationalities.json | 4 +- .../nvisy-pattern/assets/patterns/phone.json | 2 +- .../assets/patterns/religions.json | 4 +- crates/nvisy-pattern/assets/patterns/ssn.json | 2 +- crates/nvisy-pattern/assets/patterns/url.json | 2 +- .../assets/patterns/us_drivers_license.json | 2 +- .../assets/patterns/us_passport.json | 2 +- .../assets/patterns/us_postal_code.json | 2 +- crates/nvisy-rig/src/agent/base/response.rs | 4 +- crates/nvisy-rig/src/agent/cv/prompt.rs | 11 +- crates/nvisy-rig/src/agent/ocr/output.rs | 11 +- 30 files changed, 629 insertions(+), 496 deletions(-) diff --git a/crates/nvisy-engine/src/operation/inference/computer_vision.rs b/crates/nvisy-engine/src/operation/inference/computer_vision.rs index 5e60a0e..5ad449b 100644 --- a/crates/nvisy-engine/src/operation/inference/computer_vision.rs +++ b/crates/nvisy-engine/src/operation/inference/computer_vision.rs @@ -7,7 +7,7 @@ use nvisy_codec::Span; use nvisy_codec::handler::ImageData; use 
nvisy_core::math::BoundingBox; use nvisy_core::{Error, Result}; -use nvisy_ontology::entity::{DetectionMethod, Entity, ImageLocation}; +use nvisy_ontology::entity::{Entity, ExtractionMethod, ImageLocation, RecognitionMethod}; use nvisy_rig::agent::{CvAgent, CvEntity, DetectionConfig}; use crate::operation::envelope::DetectedEntities; @@ -61,14 +61,15 @@ impl Operation for ComputerVision { /// Convert a [`CvEntity`] to an [`Entity`] with [`ImageLocation`]. fn map_cv_entity(cv: &CvEntity) -> Entity { - Entity::new( - cv.category.clone(), + let mut entity = Entity::new( + cv.category, cv.entity_type, &cv.label, - DetectionMethod::ObjectDetection, + RecognitionMethod::Classification, cv.confidence, - ) - .with_location( + ); + entity.extraction_methods = vec![ExtractionMethod::ObjectDetection]; + entity.with_location( ImageLocation { bounding_box: BoundingBox { x: cv.bbox[0], diff --git a/crates/nvisy-engine/src/operation/inference/ner.rs b/crates/nvisy-engine/src/operation/inference/ner.rs index 4b38aee..0e35027 100644 --- a/crates/nvisy-engine/src/operation/inference/ner.rs +++ b/crates/nvisy-engine/src/operation/inference/ner.rs @@ -8,9 +8,7 @@ use nvisy_codec::Span; use nvisy_codec::handler::TxtSpan; use nvisy_core::{Error, Result}; use nvisy_http::HttpClient; -use nvisy_ontology::entity::{ - DetectionMethod, Entity, EntityCategory, EntityKind, TextLocation, -}; +use nvisy_ontology::entity::{Entity, EntityCategory, EntityKind, RecognitionMethod, TextLocation}; use nvisy_rig::agent::{ AgentConfig, AgentProvider, DetectionConfig, KnownNerEntity, NerAgent, NerContext, }; @@ -120,7 +118,7 @@ impl Ner { for ner_entity in &ner_entities { let category: EntityCategory = match ner_entity.category { - Some(ref c) => c.clone(), + Some(c) => c, None => continue, }; let entity_kind = match ner_entity.entity_type { @@ -136,7 +134,7 @@ impl Ner { category, entity_kind, &ner_entity.value, - DetectionMethod::Ner, + RecognitionMethod::Ner, confidence, ); diff --git 
a/crates/nvisy-engine/src/operation/processing/deduplication.rs b/crates/nvisy-engine/src/operation/processing/deduplication.rs index c4b1d5e..23c5b18 100644 --- a/crates/nvisy-engine/src/operation/processing/deduplication.rs +++ b/crates/nvisy-engine/src/operation/processing/deduplication.rs @@ -2,10 +2,10 @@ //! //! Merges entities that share the same `entity_kind`, `value`, and //! overlapping location into a single entity with the highest -//! confidence and `DetectionMethod::Composite` when methods differ. +//! confidence and combined recognition methods. use nvisy_core::Result; -use nvisy_ontology::entity::{DetectionMethod, Entities, Entity, Location}; +use nvisy_ontology::entity::{Entities, Entity, Location, RefinementMethod}; use crate::operation::envelope::RefinedEntities; use crate::operation::{Operation, ParallelContext}; @@ -19,8 +19,8 @@ const TARGET: &str = "nvisy_engine::op::deduplication"; /// /// When merging: /// - The highest confidence score is kept. -/// - If the detection methods differ, the merged entity uses -/// `DetectionMethod::Composite`. +/// - Recognition methods are combined into an ordered vector. +/// - [`RefinementMethod::Deduplication`] is recorded on the merged entity. 
pub struct Deduplication; impl Deduplication { @@ -51,8 +51,18 @@ impl Deduplication { if entity.confidence > existing.confidence { existing.confidence = entity.confidence; } - if existing.detection_method != entity.detection_method { - existing.detection_method = DetectionMethod::Composite; + for m in entity.recognition_methods { + if !existing.recognition_methods.contains(&m) { + existing.recognition_methods.push(m); + } + } + if !existing + .refinement_methods + .contains(&RefinementMethod::Deduplication) + { + existing + .refinement_methods + .push(RefinementMethod::Deduplication); } } None => { @@ -88,19 +98,19 @@ fn locations_overlap(a: &Option, b: &Option) -> bool { #[cfg(test)] mod tests { - use nvisy_ontology::entity::{EntityCategory, EntityKind, TextLocation}; + use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod, TextLocation}; use super::*; fn text_entity( value: &str, - method: DetectionMethod, + method: RecognitionMethod, confidence: f64, start: usize, end: usize, ) -> Entity { Entity::new( - EntityCategory::Pii, + EntityCategory::PersonalIdentity, EntityKind::PersonName, value, method, @@ -119,34 +129,44 @@ mod tests { #[test] fn duplicates_merged_same_method() { let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.8, 0, 4), - text_entity("John", DetectionMethod::Regex, 0.9, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.8, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.9, 0, 4), ] .into(); let result = Deduplication::execute(entities); assert_eq!(result.len(), 1); assert!((result[0].confidence - 0.9).abs() < f64::EPSILON); - assert_eq!(result[0].detection_method, DetectionMethod::Regex); + assert_eq!( + result[0].recognition_methods, + vec![RecognitionMethod::Regex] + ); + assert_eq!( + result[0].refinement_methods, + vec![RefinementMethod::Deduplication] + ); } #[test] - fn different_methods_become_composite() { + fn different_methods_are_combined() { let entities: Entities = 
vec![ - text_entity("John", DetectionMethod::Regex, 0.8, 0, 4), - text_entity("John", DetectionMethod::Ner, 0.85, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.8, 0, 4), + text_entity("John", RecognitionMethod::Ner, 0.85, 0, 4), ] .into(); let result = Deduplication::execute(entities); assert_eq!(result.len(), 1); - assert_eq!(result[0].detection_method, DetectionMethod::Composite); + assert_eq!( + result[0].recognition_methods, + vec![RecognitionMethod::Regex, RecognitionMethod::Ner] + ); assert!((result[0].confidence - 0.85).abs() < f64::EPSILON); } #[test] fn non_overlapping_preserved() { let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.8, 0, 4), - text_entity("John", DetectionMethod::Regex, 0.9, 10, 14), + text_entity("John", RecognitionMethod::Regex, 0.8, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.9, 10, 14), ] .into(); let result = Deduplication::execute(entities); @@ -156,8 +176,8 @@ mod tests { #[test] fn different_values_not_merged() { let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.8, 0, 4), - text_entity("Jane", DetectionMethod::Regex, 0.9, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.8, 0, 4), + text_entity("Jane", RecognitionMethod::Regex, 0.9, 0, 4), ] .into(); let result = Deduplication::execute(entities); @@ -173,7 +193,7 @@ mod tests { #[test] fn single_entity_unchanged() { let entities: Entities = - vec![text_entity("John", DetectionMethod::Regex, 0.8, 0, 4)].into(); + vec![text_entity("John", RecognitionMethod::Regex, 0.8, 0, 4)].into(); let result = Deduplication::execute(entities); assert_eq!(result.len(), 1); } @@ -182,12 +202,15 @@ mod tests { fn overlapping_ranges_merge() { // Partially overlapping: 0..6 and 3..9. 
let entities: Entities = vec![ - text_entity("John Doe", DetectionMethod::Regex, 0.7, 0, 6), - text_entity("John Doe", DetectionMethod::Ner, 0.9, 3, 9), + text_entity("John Doe", RecognitionMethod::Regex, 0.7, 0, 6), + text_entity("John Doe", RecognitionMethod::Ner, 0.9, 3, 9), ] .into(); let result = Deduplication::execute(entities); assert_eq!(result.len(), 1); - assert_eq!(result[0].detection_method, DetectionMethod::Composite); + assert_eq!( + result[0].recognition_methods, + vec![RecognitionMethod::Regex, RecognitionMethod::Ner] + ); } } diff --git a/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs b/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs index a9cb7e2..4dbbaa0 100644 --- a/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs +++ b/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use nvisy_core::Result; -use nvisy_ontology::entity::{DetectionMethod, Entities, Entity, Location}; +use nvisy_ontology::entity::{Entities, Entity, Location, RecognitionMethod, RefinementMethod}; use crate::operation::envelope::RefinedEntities; use crate::operation::{Operation, ParallelContext}; @@ -16,9 +16,9 @@ const TARGET: &str = "nvisy_engine::op::ensemble"; pub enum FusionStrategy { /// Take the maximum confidence across all detectors. MaxConfidence, - /// Weighted average by detection method. + /// Weighted average by recognition method. WeightedAverage { - weights: HashMap, + weights: HashMap, }, /// Noisy-OR: `P = 1 − ∏(1 − pᵢ)` for independent detectors. NoisyOr, @@ -88,7 +88,9 @@ impl Ensemble { let mut total_weight = 0.0; let mut weighted_sum = 0.0; for e in &group { - let w = weights.get(&e.detection_method).copied().unwrap_or(1.0); + // Use the first recognition method for weight lookup. 
+ let primary = e.recognition_methods.first(); + let w = primary.and_then(|m| weights.get(m)).copied().unwrap_or(1.0); weighted_sum += e.confidence * w; total_weight += w; } @@ -105,10 +107,23 @@ impl Ensemble { } }; - // Use the first entity as the base and update confidence/method. + // Collect all recognition methods from the group in order. + let mut merged_methods = Vec::new(); + for e in &group { + for m in &e.recognition_methods { + if !merged_methods.contains(m) { + merged_methods.push(*m); + } + } + } + + // Use the first entity as the base and update confidence/methods. let mut result = group.into_iter().next().unwrap(); result.confidence = fused_confidence; - result.detection_method = DetectionMethod::Composite; + result.recognition_methods = merged_methods; + result + .refinement_methods + .push(RefinementMethod::EnsembleFusion); result } } @@ -139,13 +154,13 @@ mod tests { fn text_entity( value: &str, - method: DetectionMethod, + method: RecognitionMethod, confidence: f64, start: usize, end: usize, ) -> Entity { Entity::new( - EntityCategory::Pii, + EntityCategory::PersonalIdentity, EntityKind::PersonName, value, method, @@ -165,22 +180,29 @@ mod tests { fn max_confidence_strategy() { let merge = Ensemble::new(FusionStrategy::MaxConfidence); let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.7, 0, 4), - text_entity("John", DetectionMethod::Ner, 0.85, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.7, 0, 4), + text_entity("John", RecognitionMethod::Ner, 0.85, 0, 4), ] .into(); let result = merge.merge(entities); assert_eq!(result.len(), 1); assert!((result[0].confidence - 0.85).abs() < f64::EPSILON); - assert_eq!(result[0].detection_method, DetectionMethod::Composite); + assert_eq!( + result[0].recognition_methods, + vec![RecognitionMethod::Regex, RecognitionMethod::Ner] + ); + assert_eq!( + result[0].refinement_methods, + vec![RefinementMethod::EnsembleFusion] + ); } #[test] fn noisy_or_strategy() { let merge = 
Ensemble::new(FusionStrategy::NoisyOr); let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.7, 0, 4), - text_entity("John", DetectionMethod::Ner, 0.8, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.7, 0, 4), + text_entity("John", RecognitionMethod::Ner, 0.8, 0, 4), ] .into(); let result = merge.merge(entities); @@ -192,13 +214,13 @@ mod tests { #[test] fn weighted_average_strategy() { let mut weights = HashMap::new(); - weights.insert(DetectionMethod::Regex, 1.0); - weights.insert(DetectionMethod::Ner, 2.0); + weights.insert(RecognitionMethod::Regex, 1.0); + weights.insert(RecognitionMethod::Ner, 2.0); let merge = Ensemble::new(FusionStrategy::WeightedAverage { weights }); let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.6, 0, 4), - text_entity("John", DetectionMethod::Ner, 0.9, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.6, 0, 4), + text_entity("John", RecognitionMethod::Ner, 0.9, 0, 4), ] .into(); let result = merge.merge(entities); @@ -211,8 +233,8 @@ mod tests { fn non_overlapping_not_merged() { let merge = Ensemble::new(FusionStrategy::NoisyOr); let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.7, 0, 4), - text_entity("John", DetectionMethod::Ner, 0.8, 10, 14), + text_entity("John", RecognitionMethod::Regex, 0.7, 0, 4), + text_entity("John", RecognitionMethod::Ner, 0.8, 10, 14), ] .into(); let result = merge.merge(entities); @@ -223,11 +245,14 @@ mod tests { fn single_entity_unchanged() { let merge = Ensemble::new(FusionStrategy::NoisyOr); let entities: Entities = - vec![text_entity("John", DetectionMethod::Regex, 0.7, 0, 4)].into(); + vec![text_entity("John", RecognitionMethod::Regex, 0.7, 0, 4)].into(); let result = merge.merge(entities); assert_eq!(result.len(), 1); assert!((result[0].confidence - 0.7).abs() < f64::EPSILON); - assert_eq!(result[0].detection_method, DetectionMethod::Regex); + assert_eq!( + result[0].recognition_methods, + 
vec![RecognitionMethod::Regex] + ); } #[test] diff --git a/crates/nvisy-engine/src/operation/processing/manual_detection.rs b/crates/nvisy-engine/src/operation/processing/manual_detection.rs index f5cd427..59ce3b7 100644 --- a/crates/nvisy-engine/src/operation/processing/manual_detection.rs +++ b/crates/nvisy-engine/src/operation/processing/manual_detection.rs @@ -5,7 +5,7 @@ use nvisy_core::Result; use nvisy_ontology::entity::{ - Annotation, AnnotationKind, DetectionMethod, Entities, Entity, Location, + Annotation, AnnotationKind, Entities, Entity, Location, RecognitionMethod, }; use serde::Deserialize; @@ -37,7 +37,7 @@ pub struct ManualOutput { } /// Converts each inclusion [`Annotation`] into a full [`Entity`] with -/// `DetectionMethod::Manual` and confidence 1.0. Collects exclusion +/// `RecognitionMethod::Manual` and confidence 1.0. Collects exclusion /// annotations for downstream filtering. pub struct ManualDetection; @@ -54,8 +54,8 @@ impl ManualDetection { for ann in &annotations { match ann.kind { AnnotationKind::Inclusion => { - let category = match &ann.category { - Some(c) => c.clone(), + let category = match ann.category { + Some(c) => c, None => continue, }; let entity_kind = match ann.entity_kind { @@ -65,7 +65,7 @@ impl ManualDetection { let value = ann.value.clone().unwrap_or_default(); let mut entity = - Entity::new(category, entity_kind, value, DetectionMethod::Manual, 1.0); + Entity::new(category, entity_kind, value, RecognitionMethod::Manual, 1.0); entity.location = ann.location.clone(); entities.push(entity); } @@ -128,10 +128,10 @@ mod tests { fn make_entity(value: &str, start: usize, end: usize) -> Entity { Entity::new( - EntityCategory::Pii, + EntityCategory::PersonalIdentity, EntityKind::PersonName, value, - DetectionMethod::Manual, + RecognitionMethod::Manual, 1.0, ) .with_location( @@ -199,7 +199,7 @@ mod tests { let annotations = vec![ Annotation { kind: AnnotationKind::Inclusion, - category: Some(EntityCategory::Pii), + category: 
Some(EntityCategory::PersonalIdentity), entity_kind: Some(EntityKind::PersonName), value: Some("Alice".into()), location: None, diff --git a/crates/nvisy-engine/src/operation/processing/pattern_match.rs b/crates/nvisy-engine/src/operation/processing/pattern_match.rs index 216f94e..4448073 100644 --- a/crates/nvisy-engine/src/operation/processing/pattern_match.rs +++ b/crates/nvisy-engine/src/operation/processing/pattern_match.rs @@ -1,18 +1,14 @@ //! Pattern-based PII/PHI entity detection operation. //! -//! Operates on text, CSV, HTML, and JSON spans, running both compiled +//! Scans type-erased text spans (`Span`) using compiled //! regex patterns and dictionary automata via [`PatternEngine`]. use nvisy_codec::Span; -use nvisy_codec::handler::{CsvSpan, HtmlSpan, JsonPath, TxtSpan}; +use nvisy_codec::handler::TextData; use nvisy_core::{Error, Result}; -use nvisy_ontology::entity::{DetectionMethod, Entities, Entity, TabularLocation, TextLocation}; -use nvisy_pattern::{ - ContextRule, DetectionSource, PatternEngine, PatternEngineBuilder, - PatternMatch as PatternMatchResult, -}; +use nvisy_ontology::entity::TextLocation; +use nvisy_pattern::{ContextRule, PatternEngine, PatternEngineBuilder, RawMatch}; use serde::Deserialize; -use serde_json::Value; use crate::operation::envelope::DetectedEntities; use crate::operation::{Operation, ParallelContext}; @@ -29,19 +25,13 @@ pub struct PatternDetectionParams { pub patterns: Option>, } -/// Multi-modality input for pattern matching. -pub enum PatternInput { - Text(Vec>), - Csv(Vec>), - - Html(Vec>), - Json(Vec>), -} - /// Pattern detection operation backed by [`PatternEngine`]. /// -/// Handles both regex and dictionary matches, replacing the former -/// separate `DictionaryDetection`. +/// Accepts type-erased text spans from any [`TextHandler`] (plain text, +/// CSV, HTML, JSON, etc.) and detects entities using regex and dictionary +/// patterns with co-occurrence boosting. 
+/// +/// [`TextHandler`]: nvisy_codec::handler::TextHandler pub struct PatternMatch { engine: PatternEngine, } @@ -59,232 +49,58 @@ impl PatternMatch { .map_err(|e| Error::validation(e.to_string(), "pattern-detection"))?; Ok(Self { engine }) } -} - -impl PatternMatch { - async fn scan(&self, data: PatternInput) -> Result { - tracing::debug!(target: TARGET, "scanning for patterns"); - match data { - PatternInput::Text(spans) => self.detect_text(spans).map(DetectedEntities), - PatternInput::Csv(spans) => self.detect_csv(spans).map(DetectedEntities), - PatternInput::Html(spans) => self.detect_html(spans).map(DetectedEntities), - PatternInput::Json(spans) => self.detect_json(spans).map(DetectedEntities), - } - } -} - -impl Operation for PatternMatch { - type Input = ParallelContext; - type Output = ParallelContext; - - async fn call(&self, input: Self::Input) -> Result { - input.parallel_map(|data| self.scan(data)).await - } -} - -impl PatternMatch { - fn detect_text(&self, spans: Vec>) -> Result { - // Phase 1: collect raw matches per span index. - let span_data: Vec<&str> = spans.iter().map(|s| s.data.as_str()).collect(); - let mut raw_matches: Vec<(usize, PatternMatchResult)> = Vec::new(); - - for (idx, span) in spans.iter().enumerate() { - for m in self.engine.scan_text(&span.data) { - raw_matches.push((idx, m)); - } - } - - // Phase 2: apply co-occurrence boost and build entities. 
- let mut entities = Vec::new(); - for (span_idx, m) in &raw_matches { - let confidence = if let Some(ref ctx) = m.context { - apply_cooccurrence(&span_data, *span_idx, ctx, m.confidence) - } else { - m.confidence - }; - - let method = detection_method(m.source); - - let entity = Entity::new( - m.category.clone(), - m.entity_kind, - &m.value, - method, - confidence, - ) - .with_location( - TextLocation { - start_offset: m.start, - end_offset: m.end, - element_id: Some(spans[*span_idx].id.0.to_string()), - ..Default::default() - } - .into(), - ) - .with_parent(&spans[*span_idx].source); - - entities.push(entity); - } - - Ok(entities.into()) - } - - fn detect_csv(&self, spans: Vec>) -> Result { - // Collect all span data (including headers) for co-occurrence window. - let span_data: Vec<&str> = spans.iter().map(|s| s.data.as_str()).collect(); - - // Phase 1: collect raw matches per span index (skip headers). - let mut raw_matches: Vec<(usize, PatternMatchResult)> = Vec::new(); - for (idx, span) in spans.iter().enumerate() { - if span.id.header || span.data.is_empty() { - continue; - } - for m in self.engine.scan_text(&span.data) { - raw_matches.push((idx, m)); - } - } - - // Phase 2: apply co-occurrence boost and build entities. 
- let mut entities = Vec::new(); - for (span_idx, m) in &raw_matches { - let confidence = if let Some(ref ctx) = m.context { - apply_cooccurrence(&span_data, *span_idx, ctx, m.confidence) - } else { - m.confidence - }; - let method = detection_method(m.source); - let span = &spans[*span_idx]; + fn detect(&self, spans: Vec>) -> Result { + tracing::debug!(target: TARGET, span_count = spans.len(), "scanning for patterns"); - let entity = Entity::new( - m.category.clone(), - m.entity_kind, - &m.value, - method, - confidence, - ) - .with_location( - TabularLocation { - row_index: span.id.row, - column_index: span.id.col, - start_offset: Some(m.start), - end_offset: Some(m.end), - column_name: None, - sheet_name: None, - } - .into(), - ) - .with_parent(&span.source); - - entities.push(entity); - } - - Ok(entities.into()) - } - - fn detect_html(&self, spans: Vec>) -> Result { let span_data: Vec<&str> = spans.iter().map(|s| s.data.as_str()).collect(); - let mut raw_matches: Vec<(usize, PatternMatchResult)> = Vec::new(); + let mut raw_matches: Vec<(usize, RawMatch)> = Vec::new(); for (idx, span) in spans.iter().enumerate() { - for m in self.engine.scan_text(&span.data) { + for m in self.engine.scan_text(span.data.as_str()) { raw_matches.push((idx, m)); } } let mut entities = Vec::new(); - for (span_idx, m) in &raw_matches { + for (span_idx, m) in raw_matches { let confidence = if let Some(ref ctx) = m.context { - apply_cooccurrence(&span_data, *span_idx, ctx, m.confidence) + apply_cooccurrence(&span_data, span_idx, ctx, m.confidence) } else { m.confidence }; - - let method = detection_method(m.source); - - let entity = Entity::new( - m.category.clone(), - m.entity_kind, - &m.value, - method, - confidence, - ) - .with_location( - TextLocation { - start_offset: m.start, - end_offset: m.end, - element_id: Some(spans[*span_idx].id.0.to_string()), - ..Default::default() - } - .into(), - ) - .with_parent(&spans[*span_idx].source); + let start = m.start; + let end = m.end; + let 
element_id = spans[span_idx].id.to_string(); + let source = spans[span_idx].source; + + let mut entity = m.into_entity(); + entity.confidence = confidence; + let entity = entity + .with_location( + TextLocation { + start_offset: start, + end_offset: end, + element_id: Some(element_id), + ..Default::default() + } + .into(), + ) + .with_parent(&source); entities.push(entity); } - Ok(entities.into()) - } - - fn detect_json(&self, spans: Vec>) -> Result { - // Filter to string-valued spans and collect text for co-occurrence. - let string_spans: Vec<(usize, &str)> = spans - .iter() - .enumerate() - .filter_map(|(idx, s)| s.data.as_str().map(|text| (idx, text))) - .collect(); - - let span_data: Vec<&str> = string_spans.iter().map(|(_, text)| *text).collect(); - let mut raw_matches: Vec<(usize, PatternMatchResult)> = Vec::new(); - - for (co_idx, (_, text)) in string_spans.iter().enumerate() { - for m in self.engine.scan_text(text) { - raw_matches.push((co_idx, m)); - } - } - - let mut entities = Vec::new(); - for (co_idx, m) in &raw_matches { - let confidence = if let Some(ref ctx) = m.context { - apply_cooccurrence(&span_data, *co_idx, ctx, m.confidence) - } else { - m.confidence - }; - - let method = detection_method(m.source); - let (orig_idx, _) = string_spans[*co_idx]; - - let entity = Entity::new( - m.category.clone(), - m.entity_kind, - &m.value, - method, - confidence, - ) - .with_location( - TextLocation { - start_offset: m.start, - end_offset: m.end, - element_id: Some(spans[orig_idx].id.pointer.clone()), - ..Default::default() - } - .into(), - ) - .with_parent(&spans[orig_idx].source); - - entities.push(entity); - } - - Ok(entities.into()) + Ok(DetectedEntities(entities.into())) } } -/// Map a [`DetectionSource`] to a [`DetectionMethod`]. 
-fn detection_method(source: DetectionSource) -> DetectionMethod { - match source { - DetectionSource::Regex => DetectionMethod::Regex, - DetectionSource::Dictionary => DetectionMethod::Dictionary, - DetectionSource::DenyList => DetectionMethod::Dictionary, +impl Operation for PatternMatch { + type Input = ParallelContext>>; + type Output = ParallelContext; + + async fn call(&self, input: Self::Input) -> Result { + input.parallel_map(|data| async { self.detect(data) }).await } } diff --git a/crates/nvisy-engine/src/pipeline/ontology.rs b/crates/nvisy-engine/src/pipeline/ontology.rs index e26765a..fefb150 100644 --- a/crates/nvisy-engine/src/pipeline/ontology.rs +++ b/crates/nvisy-engine/src/pipeline/ontology.rs @@ -1,10 +1,10 @@ //! Explainability metadata for data protection decisions. //! -//! An [`Explanation`] records why an action was taken — which model, rule, +//! An [`Explanation`] records why an action was taken: which model, rule, //! and confidence level were involved. Types that carry this metadata //! implement the [`Explainable`] trait. -use nvisy_ontology::entity::{DetectionMethod, ModelInfo}; +use nvisy_ontology::entity::{ModelInfo, RecognitionMethod}; use schemars::JsonSchema; use semver::Version; use serde::{Deserialize, Serialize}; @@ -31,9 +31,9 @@ pub struct Explanation { /// Detection confidence score. #[serde(skip_serializing_if = "Option::is_none")] pub confidence: Option, - /// Detection method used. + /// Recognition method used. #[serde(skip_serializing_if = "Option::is_none")] - pub detection_method: Option, + pub recognition_method: Option, /// Human-readable reason for the action. #[serde(skip_serializing_if = "Option::is_none")] pub reason: Option, diff --git a/crates/nvisy-engine/src/provenance/kind.rs b/crates/nvisy-engine/src/provenance/kind.rs index 16cb602..ef87446 100644 --- a/crates/nvisy-engine/src/provenance/kind.rs +++ b/crates/nvisy-engine/src/provenance/kind.rs @@ -1,6 +1,6 @@ //! 
Two-level tagged enum discriminating audit entry categories. -use nvisy_ontology::entity::DetectionMethod; +use nvisy_ontology::entity::ExtractionMethod; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -27,16 +27,18 @@ pub enum InferenceKind { } impl InferenceKind { - /// Returns the [`DetectionMethod`] that corresponds to this inference kind. - pub fn detection_method(&self) -> DetectionMethod { + /// Returns the [`ExtractionMethod`] for inference kinds that perform + /// content extraction. Returns `None` for pure recognition or + /// non-extraction operations. + pub fn extraction_method(&self) -> Option { match self { - Self::Ocr(_) => DetectionMethod::Ocr, - Self::Transcription(_) => DetectionMethod::SpeechTranscript, - Self::Ner(_) => DetectionMethod::Ner, - Self::ComputerVision(_) => DetectionMethod::ObjectDetection, - Self::Translation(_) | Self::Classification(_) | Self::Summarization(_) => { - DetectionMethod::ContextualNlp - } + Self::Ocr(_) => Some(ExtractionMethod::OpticalCharacterRecognition), + Self::Transcription(_) => Some(ExtractionMethod::Transcription), + Self::ComputerVision(_) => Some(ExtractionMethod::ObjectDetection), + Self::Ner(_) + | Self::Translation(_) + | Self::Classification(_) + | Self::Summarization(_) => None, } } } diff --git a/crates/nvisy-ontology/src/entity/category.rs b/crates/nvisy-ontology/src/entity/category.rs index 4e39489..5460051 100644 --- a/crates/nvisy-ontology/src/entity/category.rs +++ b/crates/nvisy-ontology/src/entity/category.rs @@ -1,31 +1,55 @@ -//! Shared entity category tag. +//! Broad entity category classification. //! -//! [`EntityCategory`] classifies detected sensitive data into broad -//! categories used by both detection and pattern matching crates. +//! [`EntityCategory`] groups related [`EntityKind`](super::EntityKind) +//! variants into policy-addressable buckets. Policy selectors can +//! target an entire category (e.g. "redact all financial data") without +//! 
enumerating individual kinds. use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use strum::{Display, EnumString}; -/// Category of sensitive data an entity belongs to. -#[derive(Debug, Clone, PartialEq, Eq, Hash, Display, EnumString)] -#[derive(Serialize, Deserialize, JsonSchema)] +/// Broad category of sensitive data. +/// +/// Each [`EntityKind`](super::EntityKind) maps to exactly one category +/// via [`EntityKind::category()`](super::EntityKind::category). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum EntityCategory { - /// Personally Identifiable Information (names, SSNs, addresses, etc.). - Pii, - /// Protected Health Information (HIPAA-regulated data). - Phi, - /// Financial data (credit card numbers, bank accounts, etc.). + /// Personal identity: names, government IDs, dates of birth, and + /// other attributes that directly identify a natural person. + PersonalIdentity, + /// Contact information: email addresses, phone numbers, physical + /// addresses, postal codes, and URLs. + ContactInfo, + /// Demographic attributes: age, gender, ethnicity, religion, + /// nationality, and citizenship. + Demographic, + /// Financial instruments and accounts: payment cards, bank + /// accounts, routing numbers, IBAN, crypto addresses, and + /// monetary amounts. Financial, - /// Secrets and credentials (API keys, passwords, tokens). - Credentials, - /// Legal documents and privileged communications. - Legal, - /// Biometric data (fingerprints, iris scans, voiceprints). + /// Protected health information: medical record numbers, + /// insurance IDs, prescriptions, diagnoses, and medications. + Health, + /// Biometric identifiers: fingerprints, voiceprints, retina + /// scans, and facial geometry templates. Biometric, - /// User-defined or plugin-specific category. 
- #[strum(default)] - Custom(String), + /// Secrets and credentials: passwords, API keys, authentication + /// tokens, and private cryptographic keys. + Credentials, + /// Network and device identifiers: IP addresses, MAC addresses, + /// device IDs, and usernames. + NetworkIdentifier, + /// Geographic and spatial data: GPS coordinates and geolocation + /// metadata. + Location, + /// Sensitive visual elements detected in images or video: + /// faces, handwriting, signatures, logos, and barcodes. + Visual, + /// Organizational identifiers: company names, departments, + /// facilities, and institutional reference numbers. + Organizational, } diff --git a/crates/nvisy-ontology/src/entity/kind.rs b/crates/nvisy-ontology/src/entity/kind.rs index dbf0a2e..3c74d0a 100644 --- a/crates/nvisy-ontology/src/entity/kind.rs +++ b/crates/nvisy-ontology/src/entity/kind.rs @@ -4,8 +4,9 @@ //! can detect or redact. Each variant maps to a stable `snake_case` //! string for serialization and display. //! -//! Every variant also maps to an [`EntityCategory`] via [`EntityKind::category`] -//! and an [`EntitySensitivity`] via [`EntityKind::sensitivity`]. +//! Every variant also maps to: +//! - an [`EntityCategory`] via [`EntityKind::category`], +//! - an [`EntitySensitivity`] via [`EntityKind::sensitivity`]. use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -20,7 +21,11 @@ use super::sensitivity::EntitySensitivity; #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum EntityKind { - // Identity documents: + // Personal identity + /// Person name (full, first, or last). + PersonName, + /// Date of birth. + DateOfBirth, /// Government-issued identification number (SSN, SIN, Aadhaar, national ID, etc.). GovernmentId, /// Tax identification number (ITIN, EIN, TIN, etc.). @@ -29,44 +34,42 @@ pub enum EntityKind { DriversLicense, /// Passport number. PassportNumber, + /// National insurance or social-security equivalent (NI, BSN, AHVN, etc.). 
+ NationalInsuranceNumber, /// Vehicle identification number (VIN). VehicleId, /// License plate number. LicensePlate, - // Personal information: - /// Person name (full, first, or last). - PersonName, - /// Date of birth. - DateOfBirth, - /// Age value. - Age, - /// Demographic attribute (gender, race/ethnicity, religion, orientation, etc.). - Demographic, - - // Contact information: + // Contact information /// Email address. EmailAddress, /// Phone number. PhoneNumber, - /// Physical / mailing address. + /// Physical or mailing address. Address, /// Postal or ZIP code. PostalCode, /// URL or hyperlink. Url, - // Network & device identifiers: - /// IP address (v4 or v6). - IpAddress, - /// MAC (hardware) address. - MacAddress, - /// Device identifier (IMEI, IDFA, etc.). - DeviceId, - /// Username or online handle. - Username, - - // Financial: + // Demographic + /// Age value. + Age, + /// Gender identity. + Gender, + /// Racial or ethnic background. + Ethnicity, + /// Religious affiliation. + Religion, + /// Nationality. + Nationality, + /// Citizenship status. + Citizenship, + /// Language or dialect spoken. + Language, + + // Financial /// Payment card number (credit or debit). PaymentCard, /// Payment card security code (CVV/CVC). @@ -75,26 +78,40 @@ pub enum EntityKind { CardExpiry, /// Bank account number. BankAccount, - /// Bank routing / transit number. + /// Bank routing or transit number. BankRouting, /// International Bank Account Number (IBAN). Iban, /// SWIFT / BIC code. SwiftCode, - /// Monetary amount. - Amount, /// Cryptocurrency wallet address. CryptoAddress, + /// Monetary amount. + Amount, - // Health: + // Health /// Medical or patient identifier. MedicalId, /// Insurance policy number. InsuranceId, /// Prescription number. PrescriptionId, + /// Medical diagnosis or condition. + Diagnosis, + /// Drug or medication name in a patient context. + Medication, + + // Biometric + /// Fingerprint template or minutiae data. 
+ Fingerprint, + /// Voiceprint or speaker embedding. + Voiceprint, + /// Retina or iris scan data. + RetinaScan, + /// Facial geometry or face embedding (not a photo: see [`Face`](Self::Face)). + FacialGeometry, - // Credentials: + // Credentials /// Password or passphrase. Password, /// API key. @@ -104,31 +121,23 @@ pub enum EntityKind { /// Private cryptographic key. PrivateKey, - // Biometric: - /// Fingerprint template or minutiae data. - Fingerprint, - /// Voiceprint / speaker embedding. - Voiceprint, - /// Retina or iris scan data. - RetinaScan, - /// Facial geometry / face embedding (not a photo — see [`Face`](Self::Face)). - FacialGeometry, + // Network and device identifiers + /// IP address (v4 or v6). + IpAddress, + /// MAC (hardware) address. + MacAddress, + /// Device identifier (IMEI, IDFA, etc.). + DeviceId, + /// Username or online handle. + Username, - // Location: + // Location /// GPS coordinates (latitude / longitude). Coordinates, /// Geolocation metadata (EXIF, cell tower, etc.). GeolocationMetadata, - // Dates & times: - /// Date and/or time value. - DateTime, - - // Organizations: - /// Company or organisation name. - OrganizationName, - - // Visual / image entities: + // Visual /// Detected human face in an image. Face, /// Handwritten text region. @@ -139,35 +148,54 @@ pub enum EntityKind { Logo, /// Barcode (1D) or QR code (2D). Barcode, + + // Organizational + /// Company or institution name. + OrganizationName, + /// Internal division or department name. + DepartmentName, + /// Physical facility name (hospital, office, school). + FacilityName, + /// Legal or administrative case identifier. + CaseNumber, + /// Internal reference number (invoice, contract, PO, employee number, membership ID). + InternalId, + + // Temporal + /// Date, time, or datetime value. + DateTime, } impl EntityKind { /// Returns the [`EntityCategory`] this entity kind belongs to. 
pub fn category(&self) -> EntityCategory { match self { - // Identity & personal - Self::GovernmentId + // Personal identity + Self::PersonName + | Self::DateOfBirth + | Self::GovernmentId | Self::TaxId | Self::DriversLicense | Self::PassportNumber + | Self::NationalInsuranceNumber | Self::VehicleId - | Self::LicensePlate - | Self::PersonName - | Self::DateOfBirth - | Self::Age - | Self::Demographic => EntityCategory::Pii, + | Self::LicensePlate => EntityCategory::PersonalIdentity, // Contact Self::EmailAddress | Self::PhoneNumber | Self::Address | Self::PostalCode - | Self::Url => EntityCategory::Pii, + | Self::Url => EntityCategory::ContactInfo, - // Network & device - Self::IpAddress | Self::MacAddress | Self::DeviceId | Self::Username => { - EntityCategory::Pii - } + // Demographic + Self::Age + | Self::Gender + | Self::Ethnicity + | Self::Religion + | Self::Nationality + | Self::Citizenship + | Self::Language => EntityCategory::Demographic, // Financial Self::PaymentCard @@ -177,35 +205,50 @@ impl EntityKind { | Self::BankRouting | Self::Iban | Self::SwiftCode - | Self::Amount - | Self::CryptoAddress => EntityCategory::Financial, + | Self::CryptoAddress + | Self::Amount => EntityCategory::Financial, // Health - Self::MedicalId | Self::InsuranceId | Self::PrescriptionId => EntityCategory::Phi, + Self::MedicalId + | Self::InsuranceId + | Self::PrescriptionId + | Self::Diagnosis + | Self::Medication => EntityCategory::Health, + + // Biometric + Self::Fingerprint | Self::Voiceprint | Self::RetinaScan | Self::FacialGeometry => { + EntityCategory::Biometric + } // Credentials Self::Password | Self::ApiKey | Self::AuthToken | Self::PrivateKey => { EntityCategory::Credentials } - // Biometric - Self::Fingerprint - | Self::Voiceprint - | Self::RetinaScan - | Self::FacialGeometry - | Self::Face => EntityCategory::Biometric, + // Network + Self::IpAddress | Self::MacAddress | Self::DeviceId | Self::Username => { + EntityCategory::NetworkIdentifier + } // Location - 
Self::Coordinates | Self::GeolocationMetadata => EntityCategory::Pii, - - // Dates & times - Self::DateTime => EntityCategory::Pii, + Self::Coordinates | Self::GeolocationMetadata => EntityCategory::Location, - // Organizations - Self::OrganizationName => EntityCategory::Pii, + // Visual + Self::Face | Self::Handwriting | Self::Signature | Self::Logo | Self::Barcode => { + EntityCategory::Visual + } - // Visual / image - Self::Handwriting | Self::Signature | Self::Logo | Self::Barcode => EntityCategory::Pii, + // Organizational + Self::OrganizationName + | Self::DepartmentName + | Self::FacilityName + | Self::CaseNumber + | Self::InternalId => EntityCategory::Organizational, + + // Temporal (grouped under PersonalIdentity: bare dates most + // commonly appear alongside personal data and are regulated + // as PII by GDPR/CCPA) + Self::DateTime => EntityCategory::PersonalIdentity, } } @@ -215,6 +258,7 @@ impl EntityKind { // Critical: irrevocable identifiers, secrets, biometrics Self::GovernmentId | Self::PassportNumber + | Self::NationalInsuranceNumber | Self::PaymentCard | Self::CardSecurityCode | Self::BankAccount @@ -238,33 +282,46 @@ impl EntityKind { | Self::MedicalId | Self::InsuranceId | Self::PrescriptionId + | Self::Diagnosis + | Self::Medication | Self::Iban | Self::CryptoAddress | Self::Face - | Self::Signature => EntitySensitivity::High, + | Self::Signature + | Self::Coordinates => EntitySensitivity::High, // Medium: indirectly identifying Self::Age - | Self::Demographic + | Self::Gender + | Self::Ethnicity + | Self::Religion + | Self::Nationality + | Self::Citizenship + | Self::Language | Self::PostalCode | Self::IpAddress | Self::MacAddress | Self::DeviceId | Self::Username - | Self::Coordinates - | Self::GeolocationMetadata | Self::CardExpiry | Self::BankRouting | Self::SwiftCode | Self::VehicleId | Self::LicensePlate + | Self::GeolocationMetadata | Self::DateTime - | Self::Handwriting => EntitySensitivity::Medium, + | Self::Handwriting + | 
Self::CaseNumber + | Self::InternalId => EntitySensitivity::Medium, - // Low: quasi-public - Self::Url | Self::Amount | Self::OrganizationName | Self::Logo | Self::Barcode => { - EntitySensitivity::Low - } + // Low: quasi-public or context-dependent + Self::Url + | Self::Amount + | Self::OrganizationName + | Self::DepartmentName + | Self::FacilityName + | Self::Logo + | Self::Barcode => EntitySensitivity::Low, } } } @@ -302,10 +359,38 @@ mod tests { } #[test] - fn category_pii() { - assert_eq!(EntityKind::GovernmentId.category(), EntityCategory::Pii); - assert_eq!(EntityKind::PersonName.category(), EntityCategory::Pii); - assert_eq!(EntityKind::Address.category(), EntityCategory::Pii); + fn category_personal_identity() { + assert_eq!( + EntityKind::GovernmentId.category(), + EntityCategory::PersonalIdentity + ); + assert_eq!( + EntityKind::PersonName.category(), + EntityCategory::PersonalIdentity + ); + assert_eq!( + EntityKind::DateOfBirth.category(), + EntityCategory::PersonalIdentity + ); + } + + #[test] + fn category_contact_info() { + assert_eq!( + EntityKind::EmailAddress.category(), + EntityCategory::ContactInfo + ); + assert_eq!(EntityKind::Address.category(), EntityCategory::ContactInfo); + } + + #[test] + fn category_demographic() { + assert_eq!(EntityKind::Gender.category(), EntityCategory::Demographic); + assert_eq!( + EntityKind::Ethnicity.category(), + EntityCategory::Demographic + ); + assert_eq!(EntityKind::Religion.category(), EntityCategory::Demographic); } #[test] @@ -318,23 +403,16 @@ mod tests { } #[test] - fn category_phi() { - assert_eq!(EntityKind::MedicalId.category(), EntityCategory::Phi); - assert_eq!(EntityKind::PrescriptionId.category(), EntityCategory::Phi); + fn category_health() { + assert_eq!(EntityKind::MedicalId.category(), EntityCategory::Health); + assert_eq!(EntityKind::Diagnosis.category(), EntityCategory::Health); + assert_eq!(EntityKind::Medication.category(), EntityCategory::Health); } #[test] fn category_credentials() { 
assert_eq!(EntityKind::Password.category(), EntityCategory::Credentials); assert_eq!(EntityKind::ApiKey.category(), EntityCategory::Credentials); - assert_eq!( - EntityKind::AuthToken.category(), - EntityCategory::Credentials - ); - assert_eq!( - EntityKind::PrivateKey.category(), - EntityCategory::Credentials - ); } #[test] @@ -345,11 +423,23 @@ mod tests { ); assert_eq!(EntityKind::Voiceprint.category(), EntityCategory::Biometric); assert_eq!(EntityKind::RetinaScan.category(), EntityCategory::Biometric); + assert_eq!(EntityKind::Face.category(), EntityCategory::Visual); + } + + #[test] + fn category_organizational() { assert_eq!( - EntityKind::FacialGeometry.category(), - EntityCategory::Biometric + EntityKind::OrganizationName.category(), + EntityCategory::Organizational + ); + assert_eq!( + EntityKind::CaseNumber.category(), + EntityCategory::Organizational + ); + assert_eq!( + EntityKind::InternalId.category(), + EntityCategory::Organizational ); - assert_eq!(EntityKind::Face.category(), EntityCategory::Biometric); } #[test] @@ -383,6 +473,7 @@ mod tests { EntitySensitivity::High ); assert_eq!(EntityKind::MedicalId.sensitivity(), EntitySensitivity::High); + assert_eq!(EntityKind::Diagnosis.sensitivity(), EntitySensitivity::High); } #[test] diff --git a/crates/nvisy-ontology/src/entity/method.rs b/crates/nvisy-ontology/src/entity/method.rs index 028664d..42839d4 100644 --- a/crates/nvisy-ontology/src/entity/method.rs +++ b/crates/nvisy-ontology/src/entity/method.rs @@ -1,35 +1,146 @@ -//! Detection method classification. +//! Extraction, recognition, and refinement method classification. +//! +//! These enums form the provenance record for every detected entity, +//! documenting how content was extracted from its source modality, +//! how sensitive data was identified, and what post-detection +//! refinements were applied. 
use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use strum::{Display, EnumString}; -/// Method used to detect a sensitive entity. +/// How content was extracted from its source modality into analyzable form. +/// +/// Each variant names the technique that transformed raw content +/// (image pixels, audio samples, binary file formats) into a +/// representation suitable for entity recognition. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Display, EnumString, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] -pub enum DetectionMethod { - /// Regular expression pattern matching. +pub enum ExtractionMethod { + // Text + /// Structural parsing of document formats (PDF, DOCX, HTML) + /// into text and layout primitives. + DocumentParsing, + /// Inference of field semantics from column names, types, or + /// positional conventions in tabular data. + SchemaInference, + + // Image / Video + /// Optical character recognition: converts raster text + /// (printed or handwritten) into machine-readable characters. + OpticalCharacterRecognition, + /// Object detection: locates and labels regions of interest + /// within an image or video frame. + ObjectDetection, + /// Scene text detection: localises text embedded in natural + /// images (signs, screens, whiteboards) prior to OCR. + SceneTextDetection, + /// Table extraction: recovers row/column structure from images + /// or scanned PDFs, preserving cell relationships that plain + /// OCR loses. + TableExtraction, + /// Document layout analysis: identifies structural regions + /// (headers, footers, signature blocks, form fields) by spatial + /// arrangement rather than content. + LayoutAnalysis, + /// Metadata extraction: reads EXIF, PDF properties, or other + /// embedded metadata that may contain PII (author, GPS, device info). 
+ MetadataExtraction, + /// Frame extraction: samples individual frames from video + /// streams for downstream image analysis. + FrameExtraction, + + // Audio / Video + /// Speech-to-text transcription: converts audio into text. + Transcription, + /// Speaker diarization: segments audio by speaker identity + /// to attribute utterances before recognition. + Diarization, +} + +/// Technique used to identify a sensitive entity within extracted content. +/// +/// Each variant names a self-contained recognition strategy. +/// An entity's `recognition_methods` vector records every technique +/// that contributed to its identification, ordered by application time. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum RecognitionMethod { + // Pattern + /// Regular expression matching against known PII formats. Regex, - /// Lookup in a known-value dictionary. + /// Mathematical validation of a candidate value + /// (Luhn, IBAN mod-97, SSN area rules). + Checksum, + /// Exact-match lookup in a curated value list. Dictionary, - /// Named-entity recognition via AI model. + /// Co-occurrence analysis: keywords near a candidate raise or + /// lower confidence (e.g. "SSN" adjacent to a 9-digit number). + ContextualAnalysis, + /// Format heuristics: entropy, character distribution, or + /// structural cues that suggest a value is sensitive without + /// an explicit regex. + Heuristic, + + // Model + /// Named-entity recognition via language model. Ner, - /// Contextual NLP analysis (discourse-level understanding). - ContextualNlp, - /// OCR text extraction with bounding boxes. - Ocr, - /// Face detection in images. - FaceDetection, - /// Object detection in images. - ObjectDetection, - /// Entity detection from speech transcription. - SpeechTranscript, - /// Speaker-identified audio segment for redaction. 
- SpeakerRedaction, - /// Multiple methods combined to produce a single detection. - Composite, - /// User-provided annotations. + /// Document or field-level classification + /// (e.g. "this column contains SSNs"). + Classification, + /// Semantic similarity search via vector embeddings. + Embedding, + /// Matching extracted values against an external identity or + /// record database. + CrossReference, + + // Biometric + /// Biometric identification: face recognition, voiceprint + /// matching, or other physiological/behavioral trait analysis. + Biometric, + + // Human + /// User-provided annotation. Manual, } + +/// Post-detection refinement applied to an entity before final output. +/// +/// Refinement methods do not discover new entities: they adjust +/// confidence, merge duplicates, or verify existing detections. +/// Recorded on the entity to explain why its final state may differ +/// from the initial detection. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum RefinementMethod { + /// Cross-detector deduplication: merges overlapping entities + /// from independent detectors, combining their confidence and + /// attribution. + Deduplication, + /// Ensemble fusion: combines confidence scores from multiple + /// detectors using a voting or averaging strategy. + EnsembleFusion, + /// Model-based verification: a secondary model (typically VLM) + /// reviews detections against source content to confirm, correct, + /// or reject. + ModelVerification, + /// Policy evaluation: applies business rules, thresholds, or + /// per-category overrides to filter or re-score detections. + PolicyEvaluation, + /// Human review: a reviewer confirmed, corrected, or rejected + /// the detection. + HumanReview, + /// Confidence calibration: adjusts raw model scores to align + /// with empirical precision targets. 
+ ConfidenceCalibration, + /// Contextual promotion/demotion: surrounding document context + /// upgrades or downgrades an entity's confidence after initial + /// detection. + ContextualAdjustment, +} diff --git a/crates/nvisy-ontology/src/entity/mod.rs b/crates/nvisy-ontology/src/entity/mod.rs index 84c9404..6fa6f6e 100644 --- a/crates/nvisy-ontology/src/entity/mod.rs +++ b/crates/nvisy-ontology/src/entity/mod.rs @@ -13,20 +13,21 @@ mod model; mod output; mod sensitivity; -pub use annotation::{Annotation, AnnotationKind, AnnotationLabel, AnnotationScope}; -pub use category::EntityCategory; use derive_more::{Deref, DerefMut, From, IntoIterator}; -pub use kind::EntityKind; -pub use location::{AudioLocation, ImageLocation, Location, TabularLocation, TextLocation}; -pub use method::DetectionMethod; -pub use model::{ModelInfo, ModelKind}; use nvisy_core::content::ContentSource; -pub use output::DetectionOutput; use schemars::JsonSchema; -pub use sensitivity::EntitySensitivity; use serde::{Deserialize, Serialize}; use uuid::Uuid; +pub use self::annotation::{Annotation, AnnotationKind, AnnotationLabel, AnnotationScope}; +pub use self::category::EntityCategory; +pub use self::kind::EntityKind; +pub use self::location::{AudioLocation, ImageLocation, Location, TabularLocation, TextLocation}; +pub use self::method::{ExtractionMethod, RecognitionMethod, RefinementMethod}; +pub use self::model::{ModelInfo, ModelKind}; +pub use self::output::DetectionOutput; +pub use self::sensitivity::EntitySensitivity; + /// A detected sensitive data occurrence within a document. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] @@ -40,8 +41,14 @@ pub struct Entity { pub entity_kind: EntityKind, /// The matched text or value. pub value: String, - /// How this entity was detected. - pub detection_method: DetectionMethod, + /// How content was extracted from its source modality, ordered by application time. 
+ #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub extraction_methods: Vec, + /// Techniques used to identify this entity, ordered by application time. + pub recognition_methods: Vec, + /// Post-detection refinements applied to this entity, ordered by application time. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub refinement_methods: Vec, /// Detection confidence score in the range `[0.0, 1.0]`. pub confidence: f64, /// Modality-specific location of the entity. @@ -61,12 +68,15 @@ impl Entity { self.source.as_uuid() } - /// Create a new entity with the given detection details. + /// Create a new entity with the given recognition method and confidence. + /// + /// The `category` is derived from `entity_kind` via + /// [`EntityKind::category()`] when not supplied explicitly. pub fn new( category: EntityCategory, entity_kind: EntityKind, value: impl Into, - detection_method: DetectionMethod, + recognition_method: RecognitionMethod, confidence: f64, ) -> Self { Self { @@ -74,7 +84,9 @@ impl Entity { category, entity_kind, value: value.into(), - detection_method, + extraction_methods: Vec::new(), + recognition_methods: vec![recognition_method], + refinement_methods: Vec::new(), confidence, location: None, language: None, @@ -82,6 +94,22 @@ impl Entity { } } + /// Create a new entity, deriving the category from the entity kind. + pub fn from_kind( + entity_kind: EntityKind, + value: impl Into, + recognition_method: RecognitionMethod, + confidence: f64, + ) -> Self { + Self::new( + entity_kind.category(), + entity_kind, + value, + recognition_method, + confidence, + ) + } + /// Set the modality-specific location on this entity. pub fn with_location(mut self, location: Location) -> Self { self.location = Some(location); @@ -151,11 +179,20 @@ impl Entities { .collect() } - /// Retain only entities matching the given detection method. 
- pub fn by_method(&self, method: DetectionMethod) -> Self { + /// Retain only entities that were recognised (at least partly) by the given method. + pub fn by_recognition_method(&self, method: RecognitionMethod) -> Self { + self.0 + .iter() + .filter(|e| e.recognition_methods.contains(&method)) + .cloned() + .collect() + } + + /// Retain only entities whose content was extracted by the given method. + pub fn by_extraction_method(&self, method: ExtractionMethod) -> Self { self.0 .iter() - .filter(|e| e.detection_method == method) + .filter(|e| e.extraction_methods.contains(&method)) .cloned() .collect() } diff --git a/crates/nvisy-pattern/README.md b/crates/nvisy-pattern/README.md index ee38370..6da6ebb 100644 --- a/crates/nvisy-pattern/README.md +++ b/crates/nvisy-pattern/README.md @@ -17,7 +17,7 @@ Detection runs in three phases: dictionary are injected as synthetic matches with confidence `1.0`. Allow-list filtering is applied inline during phases 1 and 2. All three phases -feed into a unified `Vec`. +feed into a unified `Vec`. ### Pattern JSON schema @@ -27,7 +27,7 @@ Patterns are JSON definition files embedded at compile time from ```json { "name": "ssn", - "category": "pii", + "category": "personal_identity", "entity_type": "government_id", "pattern": { "regex": "\\b(\\d{3})-(\\d{2})-(\\d{4})\\b", @@ -97,7 +97,7 @@ let allow = AllowList::new() .with("000-00-0000"); let deny = DenyList::new() - .with("John Doe", EntityCategory::Pii, EntityKind::PersonName); + .with("John Doe", EntityCategory::PersonalIdentity, EntityKind::PersonName); let engine = PatternEngine::builder() .with_allow(allow) @@ -109,8 +109,7 @@ let engine = PatternEngine::builder() are silently dropped during `scan_text`. - **Deny list** (`DenyList`): if a deny-list value is found in the text but was not matched by any regex or dictionary pattern, it is injected as a - synthetic `PatternMatch` with confidence `1.0` and source - `DetectionSource::DenyList`. 
+ synthetic `RawMatch` with confidence `1.0` and `pattern_name: None`. Both types implement `FromIterator` for easy construction from iterators. diff --git a/crates/nvisy-pattern/assets/patterns/date_of_birth.json b/crates/nvisy-pattern/assets/patterns/date_of_birth.json index 26ecd52..18ff605 100644 --- a/crates/nvisy-pattern/assets/patterns/date_of_birth.json +++ b/crates/nvisy-pattern/assets/patterns/date_of_birth.json @@ -1,6 +1,6 @@ { "name": "date-of-birth", - "category": "pii", + "category": "personal_identity", "entity_type": "date_of_birth", "pattern": { "regex": "\\b(?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01])[/\\-](?:19|20)\\d{2}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/email.json b/crates/nvisy-pattern/assets/patterns/email.json index 8748538..eee1fb7 100644 --- a/crates/nvisy-pattern/assets/patterns/email.json +++ b/crates/nvisy-pattern/assets/patterns/email.json @@ -1,6 +1,6 @@ { "name": "email", - "category": "pii", + "category": "contact_info", "entity_type": "email_address", "pattern": { "regex": "\\b[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/ipv4.json b/crates/nvisy-pattern/assets/patterns/ipv4.json index 971ccd9..c635823 100644 --- a/crates/nvisy-pattern/assets/patterns/ipv4.json +++ b/crates/nvisy-pattern/assets/patterns/ipv4.json @@ -1,6 +1,6 @@ { "name": "ipv4", - "category": "pii", + "category": "network_identifier", "entity_type": "ip_address", "pattern": { "regex": "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b", diff --git a/crates/nvisy-pattern/assets/patterns/ipv6.json b/crates/nvisy-pattern/assets/patterns/ipv6.json index ce096fd..82e7f20 100644 --- a/crates/nvisy-pattern/assets/patterns/ipv6.json +++ b/crates/nvisy-pattern/assets/patterns/ipv6.json @@ -1,6 +1,6 @@ { "name": "ipv6", - "category": "pii", + "category": "network_identifier", "entity_type": "ip_address", "pattern": { "regex": 
"\\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b|(?:[0-9a-fA-F]{1,4}:){1,7}:|::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/languages.json b/crates/nvisy-pattern/assets/patterns/languages.json index 5d460af..9b4d52e 100644 --- a/crates/nvisy-pattern/assets/patterns/languages.json +++ b/crates/nvisy-pattern/assets/patterns/languages.json @@ -1,7 +1,7 @@ { "name": "languages", - "category": "pii", - "entity_type": "demographic", + "category": "demographic", + "entity_type": "language", "dictionary": { "name": "languages", "confidence": [0.85, 0.45] diff --git a/crates/nvisy-pattern/assets/patterns/mac_address.json b/crates/nvisy-pattern/assets/patterns/mac_address.json index fd8fe8e..8d62b60 100644 --- a/crates/nvisy-pattern/assets/patterns/mac_address.json +++ b/crates/nvisy-pattern/assets/patterns/mac_address.json @@ -1,6 +1,6 @@ { "name": "mac-address", - "category": "pii", + "category": "network_identifier", "entity_type": "mac_address", "pattern": { "regex": "\\b(?:[0-9A-Fa-f]{2}[:\\-]){5}[0-9A-Fa-f]{2}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/nationalities.json b/crates/nvisy-pattern/assets/patterns/nationalities.json index a32593c..bec7c86 100644 --- a/crates/nvisy-pattern/assets/patterns/nationalities.json +++ b/crates/nvisy-pattern/assets/patterns/nationalities.json @@ -1,7 +1,7 @@ { "name": "nationalities", - "category": "pii", - "entity_type": "demographic", + "category": "demographic", + "entity_type": "nationality", "dictionary": { "name": "nationalities", "confidence": 0.85 diff --git a/crates/nvisy-pattern/assets/patterns/phone.json b/crates/nvisy-pattern/assets/patterns/phone.json index 5380e94..484cc5b 100644 --- a/crates/nvisy-pattern/assets/patterns/phone.json +++ b/crates/nvisy-pattern/assets/patterns/phone.json @@ -1,6 +1,6 @@ { "name": "phone", - "category": "pii", + "category": "contact_info", "entity_type": "phone_number", "pattern": { "regex": 
"(?:\\+\\d{1,3}[\\s.\\-]?)?\\(?\\d{2,4}\\)?[\\s.\\-]?\\d{3,4}[\\s.\\-]?\\d{4}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/religions.json b/crates/nvisy-pattern/assets/patterns/religions.json index bb3d2f2..cf03849 100644 --- a/crates/nvisy-pattern/assets/patterns/religions.json +++ b/crates/nvisy-pattern/assets/patterns/religions.json @@ -1,7 +1,7 @@ { "name": "religions", - "category": "pii", - "entity_type": "demographic", + "category": "demographic", + "entity_type": "religion", "dictionary": { "name": "religions", "confidence": 0.85 diff --git a/crates/nvisy-pattern/assets/patterns/ssn.json b/crates/nvisy-pattern/assets/patterns/ssn.json index 12aeb75..21c887f 100644 --- a/crates/nvisy-pattern/assets/patterns/ssn.json +++ b/crates/nvisy-pattern/assets/patterns/ssn.json @@ -1,6 +1,6 @@ { "name": "ssn", - "category": "pii", + "category": "personal_identity", "entity_type": "government_id", "pattern": { "regex": "\\b(\\d{3})-(\\d{2})-(\\d{4})\\b", diff --git a/crates/nvisy-pattern/assets/patterns/url.json b/crates/nvisy-pattern/assets/patterns/url.json index d7bebc5..6e9907f 100644 --- a/crates/nvisy-pattern/assets/patterns/url.json +++ b/crates/nvisy-pattern/assets/patterns/url.json @@ -1,6 +1,6 @@ { "name": "url", - "category": "pii", + "category": "contact_info", "entity_type": "url", "pattern": { "regex": "\\bhttps?://[^\\s/$.?#][^\\s]*\\b", diff --git a/crates/nvisy-pattern/assets/patterns/us_drivers_license.json b/crates/nvisy-pattern/assets/patterns/us_drivers_license.json index 1c1709a..fc39bdc 100644 --- a/crates/nvisy-pattern/assets/patterns/us_drivers_license.json +++ b/crates/nvisy-pattern/assets/patterns/us_drivers_license.json @@ -1,6 +1,6 @@ { "name": "us-drivers-license", - "category": "pii", + "category": "personal_identity", "entity_type": "drivers_license", "pattern": { "regex": "\\b[A-Z]\\d{3}-\\d{4}-\\d{4}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/us_passport.json b/crates/nvisy-pattern/assets/patterns/us_passport.json 
index bf055a8..0e0c5c5 100644 --- a/crates/nvisy-pattern/assets/patterns/us_passport.json +++ b/crates/nvisy-pattern/assets/patterns/us_passport.json @@ -1,6 +1,6 @@ { "name": "us-passport", - "category": "pii", + "category": "personal_identity", "entity_type": "passport_number", "pattern": { "regex": "\\b[A-Z]\\d{8}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/us_postal_code.json b/crates/nvisy-pattern/assets/patterns/us_postal_code.json index b626956..33c7c8d 100644 --- a/crates/nvisy-pattern/assets/patterns/us_postal_code.json +++ b/crates/nvisy-pattern/assets/patterns/us_postal_code.json @@ -1,6 +1,6 @@ { "name": "us-postal-code", - "category": "pii", + "category": "contact_info", "entity_type": "postal_code", "pattern": { "regex": "\\b\\d{5}(?:-\\d{4})?\\b", diff --git a/crates/nvisy-rig/src/agent/base/response.rs b/crates/nvisy-rig/src/agent/base/response.rs index a37ff3b..ddb12a2 100644 --- a/crates/nvisy-rig/src/agent/base/response.rs +++ b/crates/nvisy-rig/src/agent/base/response.rs @@ -114,7 +114,7 @@ mod tests { #[test] fn parse_json_raw_array() { - let text = r#"[{"category":"pii","entity_type":"email_address","value":"a@b.com","confidence":0.9,"start_offset":0,"end_offset":7}]"#; + let text = r#"[{"category":"contact_info","entity_type":"email_address","value":"a@b.com","confidence":0.9,"start_offset":0,"end_offset":7}]"#; let result = ResponseParser::from_text(text) .parse_json::>() .unwrap(); @@ -123,7 +123,7 @@ mod tests { #[test] fn parse_json_fenced() { - let text = "```json\n[{\"category\":\"pii\",\"entity_type\":\"email_address\",\"value\":\"a@b.com\",\"confidence\":0.9}]\n```"; + let text = "```json\n[{\"category\":\"contact_info\",\"entity_type\":\"email_address\",\"value\":\"a@b.com\",\"confidence\":0.9}]\n```"; let result = ResponseParser::from_text(text) .parse_json::>() .unwrap(); diff --git a/crates/nvisy-rig/src/agent/cv/prompt.rs b/crates/nvisy-rig/src/agent/cv/prompt.rs index 513c31f..eade555 100644 --- 
a/crates/nvisy-rig/src/agent/cv/prompt.rs +++ b/crates/nvisy-rig/src/agent/cv/prompt.rs @@ -52,15 +52,14 @@ You have access to a computer vision tool that detects faces, license plates, an \n\ Your workflow:\n\ 1. Use the cv_detect_objects tool to detect objects in the provided image.\n\ -2. Analyze the detections and classify each into an entity category (pii, phi, etc.) \ - and specific entity type.\n\ +2. Analyze the detections and classify each into an entity category and specific entity type.\n\ 3. Return a JSON array of detected entities, each with keys: \ category, entity_type, label, confidence, bbox ([x, y, width, height] in pixels).\n\ \n\ Common entity mappings:\n\ -- face → category: pii, entity_type: biometric_data\n\ -- license_plate → category: pii, entity_type: vehicle_id\n\ -- signature → category: pii, entity_type: biometric_data\n\ -- handwriting → category: pii, entity_type: person_name (if it contains a name)\n\ +- face → category: biometric, entity_type: face\n\ +- license_plate → category: personal_identity, entity_type: vehicle_registration\n\ +- signature → category: biometric, entity_type: signature\n\ +- handwriting → category: personal_identity, entity_type: person_name (if it contains a name)\n\ \n\ If no objects are detected, return an empty array []."; diff --git a/crates/nvisy-rig/src/agent/ocr/output.rs b/crates/nvisy-rig/src/agent/ocr/output.rs index 3e754e9..c70d27e 100644 --- a/crates/nvisy-rig/src/agent/ocr/output.rs +++ b/crates/nvisy-rig/src/agent/ocr/output.rs @@ -3,7 +3,10 @@ use std::collections::HashMap; use nvisy_core::math::BoundingBox; -use nvisy_ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityKind, ImageLocation}; +use nvisy_ontology::entity::{ + Entity, EntityCategory, EntityKind, ExtractionMethod, ImageLocation, RecognitionMethod, + RefinementMethod, +}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -58,9 +61,13 @@ impl VerifiedEntity { self.category.unwrap_or(entity.category), 
self.entity_type.unwrap_or(entity.entity_kind), self.value.as_deref().unwrap_or(&entity.value), - DetectionMethod::Ocr, + RecognitionMethod::Ner, self.confidence, ); + corrected.extraction_methods = vec![ExtractionMethod::OpticalCharacterRecognition]; + corrected + .refinement_methods + .push(RefinementMethod::ModelVerification); corrected.source = entity.source; if let Some(bbox) = self.bbox { From 0b7e27f570acb69f46a0dd152d27a466a6b26d5e Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Thu, 12 Mar 2026 18:21:51 +0100 Subject: [PATCH 04/10] refactor(pattern): review and improve all nvisy-pattern modules Dictionaries: add filesystem loading (from_path, load_file, load_dir), DictionaryLoadError type, make module public with registry re-exports. Validators: expand Luhn tests, improve module docs. Patterns: add tracing target constants, improve context rule docs. Engine: fix scan_deny_list borrow issue with HashSet, add assert guard for empty recognition_methods, use BTreeMap in DenyList for determinism, add tracing targets and field recording throughout. 
Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 1 + crates/nvisy-pattern/Cargo.toml | 3 + .../src/dictionaries/csv_dictionary.rs | 68 ++++- crates/nvisy-pattern/src/dictionaries/mod.rs | 232 ++++++++++++++++-- .../src/dictionaries/text_dictionary.rs | 19 ++ crates/nvisy-pattern/src/engine/builder.rs | 18 +- crates/nvisy-pattern/src/engine/deny_list.rs | 57 +++-- crates/nvisy-pattern/src/engine/mod.rs | 173 ++++++++----- .../nvisy-pattern/src/engine/pattern_match.rs | 55 +++-- crates/nvisy-pattern/src/lib.rs | 7 +- .../src/patterns/context_rule.rs | 40 ++- .../src/patterns/json_pattern.rs | 24 +- crates/nvisy-pattern/src/patterns/mod.rs | 64 +++-- crates/nvisy-pattern/src/patterns/pattern.rs | 41 +++- crates/nvisy-pattern/src/prelude.rs | 5 +- crates/nvisy-pattern/src/validators/luhn.rs | 52 +++- crates/nvisy-pattern/src/validators/mod.rs | 6 +- 17 files changed, 650 insertions(+), 215 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab4a72a..37cd3b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3045,6 +3045,7 @@ dependencies = [ "regex", "serde", "serde_json", + "tempfile", "thiserror 2.0.18", "tracing", ] diff --git a/crates/nvisy-pattern/Cargo.toml b/crates/nvisy-pattern/Cargo.toml index 18ebacb..34ce049 100644 --- a/crates/nvisy-pattern/Cargo.toml +++ b/crates/nvisy-pattern/Cargo.toml @@ -44,3 +44,6 @@ aho-corasick = { workspace = true, features = [] } # Observability tracing = { workspace = true, features = [] } + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs b/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs index 96e55a9..1958ad8 100644 --- a/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs +++ b/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs @@ -1,7 +1,17 @@ //! CSV dictionary: one row per entity, each cell is a matchable variant. +use std::path::Path; + use super::Dictionary; +/// Error returned when a CSV dictionary cannot be parsed. 
+#[derive(Debug, thiserror::Error)] +#[error("failed to parse CSV record in dictionary '{name}': {source}")] +pub struct CsvDictionaryError { + name: String, + source: csv::Error, +} + /// A dictionary parsed from a CSV file. /// /// Each row may contain multiple columns (e.g. name, symbol, code). @@ -21,7 +31,11 @@ impl CsvDictionary { /// `text` is the CSV content where each non-empty cell becomes a matchable term. /// The column index of each cell is preserved so that per-column confidence /// scores can be applied at detection time. - pub fn new(name: impl Into, text: &str) -> Self { + /// + /// # Errors + /// + /// Returns [`CsvDictionaryError`] if any CSV record cannot be parsed. + pub fn new(name: impl Into, text: &str) -> Result { let name = name.into(); let mut entries = Vec::new(); @@ -33,21 +47,49 @@ impl CsvDictionary { .from_reader(text.as_bytes()); for result in reader.records() { - let record = result.expect("failed to parse CSV record"); + let record = result.map_err(|source| CsvDictionaryError { + name: name.clone(), + source, + })?; for (col, field) in record.iter().enumerate() { - let trimmed = field.trim(); - if !trimmed.is_empty() { - entries.push(trimmed.to_owned()); + if !field.is_empty() { + entries.push(field.to_owned()); columns.push(col); } } } - Self { + Ok(Self { name, entries, columns, - } + }) + } + + /// Load a CSV dictionary from a file path. + /// + /// The dictionary name is derived from the file stem. + /// + /// # Errors + /// + /// Returns [`DictionaryLoadError`](super::DictionaryLoadError) if the + /// file cannot be read or the CSV content cannot be parsed. 
+ pub fn from_path(path: impl AsRef) -> Result { + let path = path.as_ref(); + let name = path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or_default(); + let text = std::fs::read_to_string(path).map_err(|source| { + super::DictionaryLoadError::ReadFile { + path: path.to_owned(), + source, + } + })?; + Self::new(name, &text).map_err(|source| super::DictionaryLoadError::CsvParse { + path: path.to_owned(), + source, + }) } } @@ -71,20 +113,26 @@ mod tests { #[test] fn parses_rows_with_variants() { - let dict = CsvDictionary::new("test", "US Dollar,USD\nEuro,EUR\n"); + let dict = CsvDictionary::new("test", "US Dollar,USD\nEuro,EUR\n").unwrap(); assert_eq!(dict.name(), "test"); assert_eq!(dict.entries(), &["US Dollar", "USD", "Euro", "EUR"]); } #[test] fn handles_variable_columns() { - let dict = CsvDictionary::new("test", "a,b,c\nd,e\n"); + let dict = CsvDictionary::new("test", "a,b,c\nd,e\n").unwrap(); assert_eq!(dict.entries(), &["a", "b", "c", "d", "e"]); } #[test] fn skips_empty_fields() { - let dict = CsvDictionary::new("test", "a,,b\n"); + let dict = CsvDictionary::new("test", "a,,b\n").unwrap(); assert_eq!(dict.entries(), &["a", "b"]); } + + #[test] + fn column_indices_are_tracked() { + let dict = CsvDictionary::new("test", "a,b,c\nd,e\n").unwrap(); + assert_eq!(dict.columns(), Some([0, 1, 2, 0, 1].as_slice())); + } } diff --git a/crates/nvisy-pattern/src/dictionaries/mod.rs b/crates/nvisy-pattern/src/dictionaries/mod.rs index 6c5d5ba..2d94883 100644 --- a/crates/nvisy-pattern/src/dictionaries/mod.rs +++ b/crates/nvisy-pattern/src/dictionaries/mod.rs @@ -25,19 +25,48 @@ mod dictionary; mod text_dictionary; use std::collections::BTreeMap; +use std::path::Path; use std::sync::LazyLock; -pub use csv_dictionary::CsvDictionary; -pub use dictionary::{BoxDictionary, Dictionary}; use include_dir::{Dir, include_dir}; -pub use text_dictionary::TxtDictionary; + +pub use self::csv_dictionary::{CsvDictionary, CsvDictionaryError}; +pub use 
self::dictionary::{BoxDictionary, Dictionary}; +pub use self::text_dictionary::TxtDictionary; + +const TARGET: &str = "nvisy_pattern::dictionaries"; + +/// Error returned when loading dictionaries from a filesystem directory. +#[derive(Debug, thiserror::Error)] +pub enum DictionaryLoadError { + /// The directory could not be read. + #[error("failed to read dictionary directory '{}': {source}", path.display())] + ReadDir { + path: std::path::PathBuf, + source: std::io::Error, + }, + /// A dictionary file could not be read. + #[error("failed to read dictionary file '{}': {source}", path.display())] + ReadFile { + path: std::path::PathBuf, + source: std::io::Error, + }, + /// A CSV dictionary file failed to parse. + #[error("failed to parse CSV dictionary '{}': {source}", path.display())] + CsvParse { + path: std::path::PathBuf, + source: CsvDictionaryError, + }, +} /// A registry of named [`Dictionary`] instances with O(log n) lookup. /// /// Use [`load_builtins`] to create a registry pre-populated with -/// the compile-time-embedded dictionary files. +/// the compile-time-embedded dictionary files, or [`load_dir`] to +/// load from a filesystem directory at runtime. /// /// [`load_builtins`]: Self::load_builtins +/// [`load_dir`]: Self::load_dir pub struct DictionaryRegistry { inner: BTreeMap, } @@ -55,9 +84,7 @@ impl std::fmt::Debug for DictionaryRegistry { impl DictionaryRegistry { /// Create an empty registry. pub fn new() -> Self { - Self { - inner: BTreeMap::new(), - } + Self::default() } /// Insert a dictionary, keyed by its [`Dictionary::name`]. @@ -72,22 +99,36 @@ impl DictionaryRegistry { self.inner.get(name).map(|b| b.as_ref()) } + /// Iterate over all registered dictionaries as `(name, &dyn Dictionary)` pairs. + pub fn iter(&self) -> impl Iterator { + self.inner.iter().map(|(k, v)| (k.as_str(), v.as_ref())) + } + + /// Iterate over all registered dictionary names. 
+ pub fn names(&self) -> impl Iterator { + self.inner.keys().map(|s| s.as_str()) + } + /// Total number of registered dictionaries. #[must_use] pub fn len(&self) -> usize { self.inner.len() } + /// Whether the registry contains no dictionaries. + #[must_use] + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + /// Load all `.txt` and `.csv` files from the embedded - /// `assets/dictionaries/` directory and return a populated registry. + /// `assets/dictionaries/` directory into this registry. /// /// Unrecognised file extensions are logged as warnings and skipped. - #[tracing::instrument(name = "dictionaries.load_builtins", fields(count))] - pub fn load_builtins() -> Self { + #[tracing::instrument(target = TARGET, name = "dictionaries.load_builtins", skip(self), fields(count))] + pub fn load_builtins(&mut self) { static DICT_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/assets/dictionaries"); - let mut reg = Self::new(); - for file in DICT_DIR.files() { let path = file.path(); let text = file @@ -101,9 +142,13 @@ impl DictionaryRegistry { let dict: BoxDictionary = match path.extension().and_then(|e| e.to_str()) { Some("txt") => Box::new(TxtDictionary::new(name.as_ref(), text)), - Some("csv") => Box::new(CsvDictionary::new(name.as_ref(), text)), + Some("csv") => Box::new( + CsvDictionary::new(name.as_ref(), text) + .expect("built-in CSV dictionary must parse"), + ), other => { tracing::warn!( + target: TARGET, path = %path.display(), extension = ?other, "skipping unrecognised dictionary file", @@ -113,27 +158,119 @@ impl DictionaryRegistry { }; tracing::trace!( + target: TARGET, name = dict.name(), entries = dict.entries().len(), "dictionary loaded", ); - reg.insert(dict); + self.insert(dict); } - tracing::Span::current().record("count", reg.len()); - tracing::debug!("built-in dictionaries loaded"); - reg + tracing::Span::current().record("count", self.len()); + tracing::debug!(target: TARGET, "built-in dictionaries loaded"); + } + + /// Load a single 
`.txt` or `.csv` dictionary file and insert it. + /// + /// The dictionary name is derived from the file stem. + /// Files with unrecognised extensions are logged as warnings and + /// ignored (no error is returned). + /// + /// # Errors + /// + /// Returns [`DictionaryLoadError`] if the file cannot be read or + /// a CSV file fails to parse. + #[tracing::instrument(target = TARGET, name = "dictionaries.load_file", skip_all, fields(path = %path.as_ref().display()))] + pub fn load_file(&mut self, path: impl AsRef) -> Result<(), DictionaryLoadError> { + let path = path.as_ref(); + + let dict: BoxDictionary = match path.extension().and_then(|e| e.to_str()) { + Some("txt") => { + let d = TxtDictionary::from_path(path).map_err(|source| { + DictionaryLoadError::ReadFile { + path: path.to_owned(), + source, + } + })?; + Box::new(d) + } + Some("csv") => Box::new(CsvDictionary::from_path(path)?), + other => { + tracing::warn!( + target: TARGET, + path = %path.display(), + extension = ?other, + "skipping unrecognised dictionary file", + ); + return Ok(()); + } + }; + + tracing::trace!( + target: TARGET, + name = dict.name(), + entries = dict.entries().len(), + "dictionary loaded from filesystem", + ); + self.insert(dict); + Ok(()) + } + + /// Load all `.txt` and `.csv` files from a filesystem directory. + /// + /// Files with unrecognised extensions are logged as warnings and + /// skipped. Loaded dictionaries are inserted into `self`, so this + /// can be called after [`load_builtins`](Self::load_builtins) to + /// layer user-provided dictionaries on top of the built-ins. + /// + /// # Errors + /// + /// Returns [`DictionaryLoadError`] if the directory cannot be read, + /// a file cannot be read, or a CSV file fails to parse. 
+ #[tracing::instrument(target = TARGET, name = "dictionaries.load_dir", skip_all, fields(path = %dir.as_ref().display(), count))] + pub fn load_dir(&mut self, dir: impl AsRef) -> Result<(), DictionaryLoadError> { + let dir = dir.as_ref(); + + let entries = std::fs::read_dir(dir).map_err(|source| DictionaryLoadError::ReadDir { + path: dir.to_owned(), + source, + })?; + + let mut count = 0usize; + for entry in entries { + let entry = entry.map_err(|source| DictionaryLoadError::ReadDir { + path: dir.to_owned(), + source, + })?; + let path = entry.path(); + + if !path.is_file() { + continue; + } + + self.load_file(&path)?; + count += 1; + } + + tracing::Span::current().record("count", count); + tracing::debug!(target: TARGET, "filesystem dictionaries loaded"); + Ok(()) } } impl Default for DictionaryRegistry { fn default() -> Self { - Self::new() + Self { + inner: BTreeMap::new(), + } } } -static BUILTIN_REGISTRY: LazyLock = - LazyLock::new(DictionaryRegistry::load_builtins); +static BUILTIN_REGISTRY: LazyLock = LazyLock::new(|| { + let mut reg = DictionaryRegistry::new(); + reg.load_builtins(); + reg +}); /// Return a reference to the lazily-initialised built-in [`DictionaryRegistry`]. 
pub fn builtin_registry() -> &'static DictionaryRegistry { @@ -142,6 +279,8 @@ pub fn builtin_registry() -> &'static DictionaryRegistry { #[cfg(test)] mod tests { + use std::collections::HashSet; + use super::*; fn registry() -> &'static DictionaryRegistry { @@ -151,15 +290,15 @@ mod tests { #[test] fn builtins_load_and_are_nonempty() { let reg = registry(); - assert!(reg.len() > 0); - for (_, dict) in ®.inner { + assert!(!reg.is_empty()); + for (_, dict) in reg.iter() { assert!(!dict.entries().is_empty(), "{} is empty", dict.name()); } } #[test] fn entries_are_trimmed_and_nonempty() { - for (_, dict) in ®istry().inner { + for (_, dict) in registry().iter() { let name = dict.name(); for entry in dict.entries() { assert!(!entry.is_empty(), "empty entry in {name}"); @@ -168,9 +307,23 @@ mod tests { } } + #[test] + fn no_duplicate_entries_per_dictionary() { + for (_, dict) in registry().iter() { + let mut seen = HashSet::new(); + for entry in dict.entries() { + assert!( + seen.insert(entry.as_str()), + "duplicate entry {entry:?} in dictionary {}", + dict.name(), + ); + } + } + } + #[test] fn registry_names_are_sorted() { - let keys: Vec<&str> = registry().inner.keys().map(|s| s.as_str()).collect(); + let keys: Vec<&str> = registry().names().collect(); let mut sorted = keys.clone(); sorted.sort(); assert_eq!(keys, sorted); @@ -188,4 +341,35 @@ mod tests { assert_eq!(dict.name(), "test"); assert_eq!(dict.entries(), &["foo", "bar"]); } + + #[test] + fn load_dir_reads_filesystem() { + let dir = tempfile::tempdir().unwrap(); + + std::fs::write(dir.path().join("colors.txt"), "red\nblue\ngreen\n").unwrap(); + std::fs::write(dir.path().join("sizes.csv"), "small,S\nmedium,M\nlarge,L\n").unwrap(); + // Should be skipped. 
+ std::fs::write(dir.path().join("readme.md"), "ignore me").unwrap(); + + let mut reg = DictionaryRegistry::new(); + reg.load_dir(dir.path()).unwrap(); + + assert_eq!(reg.len(), 2); + + let colors = reg.get("colors").unwrap(); + assert_eq!(colors.entries(), &["red", "blue", "green"]); + + let sizes = reg.get("sizes").unwrap(); + assert_eq!( + sizes.entries(), + &["small", "S", "medium", "M", "large", "L"] + ); + } + + #[test] + fn load_dir_missing_directory() { + let mut reg = DictionaryRegistry::new(); + let result = reg.load_dir("/nonexistent/path"); + assert!(result.is_err()); + } } diff --git a/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs b/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs index 6273470..d4519cb 100644 --- a/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs +++ b/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs @@ -1,5 +1,7 @@ //! Plain-text dictionary: one matchable entry per line. +use std::path::Path; + use super::Dictionary; /// A dictionary parsed from a plain-text file (one entry per line). @@ -26,6 +28,23 @@ impl TxtDictionary { Self { name, entries } } + + /// Load a plain-text dictionary from a file path. + /// + /// The dictionary name is derived from the file stem. + /// + /// # Errors + /// + /// Returns [`std::io::Error`] if the file cannot be read. 
+ pub fn from_path(path: impl AsRef) -> std::io::Result { + let path = path.as_ref(); + let name = path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or_default(); + let text = std::fs::read_to_string(path)?; + Ok(Self::new(name, &text)) + } } impl Dictionary for TxtDictionary { diff --git a/crates/nvisy-pattern/src/engine/builder.rs b/crates/nvisy-pattern/src/engine/builder.rs index 3cfb4f8..581adb7 100644 --- a/crates/nvisy-pattern/src/engine/builder.rs +++ b/crates/nvisy-pattern/src/engine/builder.rs @@ -10,10 +10,12 @@ use crate::dictionaries; use crate::patterns::{self, MatchSource, Pattern}; use crate::validators::ValidatorResolver; +const TARGET: &str = "nvisy_pattern::engine"; + /// Builder for [`PatternEngine`]. /// /// By default all built-in patterns are included. Use -/// [`patterns`](Self::patterns) to restrict to a subset. +/// [`with_patterns`](Self::with_patterns) to restrict to a subset. #[derive(Default)] pub struct PatternEngineBuilder { pattern_names: Option>, @@ -69,14 +71,14 @@ impl PatternEngineBuilder { /// Returns [`PatternEngineError`] if a regex fails to compile, a /// referenced dictionary is missing, or the Aho-Corasick automaton /// cannot be built. 
- #[tracing::instrument(name = "PatternEngine::build", skip(self))] + #[tracing::instrument(target = TARGET, name = "PatternEngine::build", skip(self))] pub fn build(self) -> Result { let pat_reg = patterns::builtin_registry(); let dict_reg = dictionaries::builtin_registry(); let active: Vec<&dyn Pattern> = match &self.pattern_names { Some(names) => names.iter().filter_map(|n| pat_reg.get(n)).collect(), - None => pat_reg.values(), + None => pat_reg.iter().collect(), }; let mut regex_entries = Vec::new(); @@ -86,15 +88,16 @@ impl PatternEngineBuilder { for p in &active { match p.match_source() { MatchSource::Regex(rp) => { + let effective = rp.effective_regex(); let compiled = - Regex::new(&rp.regex).map_err(|e| PatternEngineError::RegexCompile { + Regex::new(&effective).map_err(|e| PatternEngineError::RegexCompile { name: p.name().to_owned(), source: e, })?; - regex_strings.push(rp.regex.clone()); + regex_strings.push(effective); regex_entries.push(RegexEntry { pattern_name: p.name().to_owned(), - category: p.category().clone(), + category: p.category(), entity_kind: p.entity_kind(), confidence: rp.confidence, validator_name: rp.validator.clone(), @@ -123,7 +126,7 @@ impl PatternEngineBuilder { })?; dict_entries.push(DictEntry { pattern_name: p.name().to_owned(), - category: p.category().clone(), + category: p.category(), entity_kind: p.entity_kind(), confidence: dp.confidence.clone(), automaton, @@ -140,6 +143,7 @@ impl PatternEngineBuilder { let validators = ValidatorResolver::builtins(); tracing::debug!( + target: TARGET, regex_count = regex_entries.len(), dict_count = dict_entries.len(), "PatternEngine built", diff --git a/crates/nvisy-pattern/src/engine/deny_list.rs b/crates/nvisy-pattern/src/engine/deny_list.rs index 946784c..620c4ca 100644 --- a/crates/nvisy-pattern/src/engine/deny_list.rs +++ b/crates/nvisy-pattern/src/engine/deny_list.rs @@ -1,24 +1,26 @@ //! [`DenyList`] — forced detection of known sensitive values. 
-use std::collections::HashMap; +use std::collections::BTreeMap; -use nvisy_ontology::entity::{EntityCategory, EntityKind}; +use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod}; -/// A deny-list entry: a known sensitive value that must always be detected. +/// A deny-list rule: a known sensitive value that must always be detected. #[derive(Debug, Clone)] -pub struct DenyEntry { +pub struct DenyRule { /// Entity category for the injected match. pub category: EntityCategory, /// Entity kind for the injected match. pub entity_kind: EntityKind, + /// Recognition method to assign to injected matches. + pub method: RecognitionMethod, } /// Exact-match deny list for forcing detection of known sensitive values. /// /// If a deny-list value is found in the scanned text but was not already /// matched by any regex or dictionary pattern, it is injected as a synthetic -/// [`PatternMatch`](super::PatternMatch) with confidence `1.0` and source -/// [`DetectionSource::DenyList`](super::DetectionSource::DenyList). +/// [`RawMatch`](super::RawMatch) with confidence `1.0` and +/// `pattern_name: None`. /// /// # Examples /// @@ -26,12 +28,12 @@ pub struct DenyEntry { /// use nvisy_ontology::entity::{EntityCategory, EntityKind}; /// /// let deny = DenyList::new() -/// .with("John Doe", EntityCategory::Pii, EntityKind::PersonName) -/// .with("ACME Corp", EntityCategory::Pii, EntityKind::Organization); +/// .with("John Doe", EntityCategory::PersonalIdentity, EntityKind::PersonName) +/// .with("ACME Corp", EntityCategory::PersonalIdentity, EntityKind::OrganizationName); /// ``` #[derive(Debug, Clone, Default)] pub struct DenyList { - pub(crate) entries: HashMap, + pub(crate) entries: BTreeMap, } impl DenyList { @@ -40,7 +42,7 @@ impl DenyList { Self::default() } - /// Add a single entry. + /// Add a single rule with `RecognitionMethod::Dictionary` as the default method. 
pub fn with( mut self, value: impl Into, @@ -49,15 +51,35 @@ impl DenyList { ) -> Self { self.entries.insert( value.into(), - DenyEntry { + DenyRule { category, entity_kind, + method: RecognitionMethod::Dictionary, }, ); self } - /// Insert an entry into this list. + /// Add a single rule with an explicit recognition method. + pub fn with_method( + mut self, + value: impl Into, + category: EntityCategory, + entity_kind: EntityKind, + method: RecognitionMethod, + ) -> Self { + self.entries.insert( + value.into(), + DenyRule { + category, + entity_kind, + method, + }, + ); + self + } + + /// Insert a rule into this list with `RecognitionMethod::Dictionary` as the default method. pub fn insert( &mut self, value: impl Into, @@ -66,9 +88,10 @@ impl DenyList { ) { self.entries.insert( value.into(), - DenyEntry { + DenyRule { category, entity_kind, + method: RecognitionMethod::Dictionary, }, ); } @@ -79,9 +102,9 @@ impl DenyList { self.entries.contains_key(value) } - /// Look up the entry for a value. + /// Look up the rule for a value. #[must_use] - pub fn get(&self, value: &str) -> Option<&DenyEntry> { + pub fn get(&self, value: &str) -> Option<&DenyRule> { self.entries.get(value) } @@ -97,8 +120,8 @@ impl DenyList { self.entries.is_empty() } - /// Iterate over (value, entry) pairs. - pub fn iter(&self) -> impl Iterator { + /// Iterate over (value, rule) pairs. + pub fn iter(&self) -> impl Iterator { self.entries.iter().map(|(k, v)| (k.as_str(), v)) } } diff --git a/crates/nvisy-pattern/src/engine/mod.rs b/crates/nvisy-pattern/src/engine/mod.rs index fcf7077..366b0f9 100644 --- a/crates/nvisy-pattern/src/engine/mod.rs +++ b/crates/nvisy-pattern/src/engine/mod.rs @@ -10,8 +10,7 @@ //! - [`PatternEngine`]: the pre-compiled scanning engine. //! - [`PatternEngineBuilder`]: builder for configuring patterns, thresholds, //! and allow/deny lists. -//! - [`PatternMatch`]: a single match produced by scanning. -//! 
- [`DetectionSource`]: how a match was produced (regex, dictionary, deny list). +//! - [`RawMatch`]: a single match produced by scanning. //! - [`AllowList`] / [`DenyList`]: exact-match suppression and forced detection. //! - [`PatternEngineError`]: build-time errors. @@ -21,20 +20,23 @@ mod deny_list; mod error; mod pattern_match; +use std::collections::HashSet; use std::sync::LazyLock; use aho_corasick::AhoCorasick; -pub use allow_list::AllowList; -pub use builder::PatternEngineBuilder; -pub use deny_list::{DenyEntry, DenyList}; -pub use error::PatternEngineError; -use nvisy_ontology::entity::{EntityCategory, EntityKind}; -pub use pattern_match::{DetectionSource, PatternMatch}; +use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod}; use regex::{Regex, RegexSet}; +pub use self::allow_list::AllowList; +pub use self::builder::PatternEngineBuilder; +pub use self::deny_list::{DenyList, DenyRule}; +pub use self::error::PatternEngineError; +pub use self::pattern_match::RawMatch; use crate::patterns::{ContextRule, DictionaryConfidence}; use crate::validators::ValidatorResolver; +const TARGET: &str = "nvisy_pattern::engine"; + /// Metadata stored alongside each compiled regex. struct RegexEntry { pattern_name: String, @@ -114,28 +116,13 @@ impl PatternEngine { PatternEngineBuilder::default() } - /// Validate a value using the checksum associated with the entity kind. - /// - /// Returns `Some(true)` if the value passes, `Some(false)` if it fails, - /// or `None` if no checksum validator is registered for that entity kind. - pub fn validate_checksum(&self, entity_kind: EntityKind, value: &str) -> Option { - let validator_name = match entity_kind { - EntityKind::PaymentCard => "luhn", - EntityKind::GovernmentId => "ssn", - EntityKind::Iban => "iban", - _ => return None, - }; - let validate = self.validators.resolve(validator_name)?; - Some(validate(value)) - } - /// Scan `text` and return all matches above the confidence threshold. 
/// /// Matches whose value appears in the allow list are suppressed. /// Deny-list values found in the text are injected as synthetic matches /// with confidence `1.0` when not already matched. - #[tracing::instrument(skip(self, text), fields(text_len = text.len(), matches))] - pub fn scan_text(&self, text: &str) -> Vec { + #[tracing::instrument(target = TARGET, skip(self, text), fields(text_len = text.len(), matches = tracing::field::Empty))] + pub fn scan_text(&self, text: &str) -> Vec { let mut results = Vec::new(); self.scan_regex(text, &mut results); @@ -148,7 +135,7 @@ impl PatternEngine { /// Phase 1: regex matches — use `RegexSet` as a pre-filter, then run /// each matching regex individually to extract offsets and values. - fn scan_regex(&self, text: &str, results: &mut Vec) { + fn scan_regex(&self, text: &str, results: &mut Vec) { let set_matches = self.regex_set.matches(text); for idx in set_matches.iter() { let entry = &self.regex_entries[idx]; @@ -164,22 +151,26 @@ impl PatternEngine { continue; } + let mut methods = vec![RecognitionMethod::Regex]; + if let Some(ref vname) = entry.validator_name && let Some(validate) = self.validators.resolve(vname) - && !validate(value) { - continue; + if !validate(value) { + continue; + } + methods.push(RecognitionMethod::Checksum); } - results.push(PatternMatch { - pattern_name: entry.pattern_name.clone(), - category: entry.category.clone(), + results.push(RawMatch { + pattern_name: Some(entry.pattern_name.clone()), + category: entry.category, entity_kind: entry.entity_kind, value: value.to_owned(), start: mat.start(), end: mat.end(), confidence: entry.confidence, - source: DetectionSource::Regex, + recognition_methods: methods, context: entry.context.clone(), }); } @@ -187,7 +178,7 @@ impl PatternEngine { } /// Phase 2: dictionary matches via Aho-Corasick automata. 
- fn scan_dict(&self, text: &str, results: &mut Vec) { + fn scan_dict(&self, text: &str, results: &mut Vec) { for entry in &self.dict_entries { for mat in entry.automaton.find_iter(text) { let pat_idx = mat.pattern().as_usize(); @@ -205,15 +196,15 @@ impl PatternEngine { continue; } - results.push(PatternMatch { - pattern_name: entry.pattern_name.clone(), - category: entry.category.clone(), + results.push(RawMatch { + pattern_name: Some(entry.pattern_name.clone()), + category: entry.category, entity_kind: entry.entity_kind, value: value.clone(), start: mat.start(), end: mat.end(), confidence, - source: DetectionSource::Dictionary, + recognition_methods: vec![RecognitionMethod::Dictionary], context: entry.context.clone(), }); } @@ -222,29 +213,33 @@ impl PatternEngine { /// Phase 3: inject deny-list values found in `text` that were not /// already matched by regex or dictionary. - fn scan_deny_list(&self, text: &str, results: &mut Vec) { - for (deny_value, deny_entry) in self.deny_set.iter() { - if results.iter().any(|r| r.value == deny_value) { + fn scan_deny_list(&self, text: &str, results: &mut Vec) { + let matched_values: HashSet<&str> = results.iter().map(|r| r.value.as_str()).collect(); + + let mut deny_matches = Vec::new(); + for (deny_value, deny_rule) in self.deny_set.iter() { + if matched_values.contains(deny_value) { continue; } let mut search_start = 0; while let Some(pos) = text[search_start..].find(deny_value) { let abs_start = search_start + pos; let abs_end = abs_start + deny_value.len(); - results.push(PatternMatch { - pattern_name: String::new(), - category: deny_entry.category.clone(), - entity_kind: deny_entry.entity_kind, + deny_matches.push(RawMatch { + pattern_name: None, + category: deny_rule.category, + entity_kind: deny_rule.entity_kind, value: deny_value.to_owned(), start: abs_start, end: abs_end, confidence: 1.0, - source: DetectionSource::DenyList, + recognition_methods: vec![deny_rule.method], context: None, }); search_start = abs_end; 
} } + results.extend(deny_matches); } } @@ -275,7 +270,9 @@ mod tests { let engine = default_engine(); let matches = engine.scan_text("My SSN is 123-45-6789."); assert!( - matches.iter().any(|m| m.pattern_name == "ssn"), + matches + .iter() + .any(|m| m.pattern_name.as_deref() == Some("ssn")), "expected SSN match, got: {:?}", matches.iter().map(|m| &m.pattern_name).collect::>() ); @@ -286,7 +283,9 @@ mod tests { let engine = default_engine(); let matches = engine.scan_text("Contact: alice@example.com"); assert!( - matches.iter().any(|m| m.pattern_name == "email"), + matches + .iter() + .any(|m| m.pattern_name.as_deref() == Some("email")), "expected email match, got: {:?}", matches.iter().map(|m| &m.pattern_name).collect::>() ); @@ -300,7 +299,9 @@ mod tests { .unwrap(); let matches = engine.scan_text("My SSN is 123-45-6789."); assert!( - !matches.iter().any(|m| m.pattern_name == "ssn"), + !matches + .iter() + .any(|m| m.pattern_name.as_deref() == Some("ssn")), "SSN should be filtered by 0.99 threshold" ); } @@ -320,7 +321,10 @@ mod tests { let engine = default_engine(); let text = "SSN: 123-45-6789"; let matches = engine.scan_text(text); - let ssn_match = matches.iter().find(|m| m.pattern_name == "ssn").unwrap(); + let ssn_match = matches + .iter() + .find(|m| m.pattern_name.as_deref() == Some("ssn")) + .unwrap(); assert_eq!(&text[ssn_match.start..ssn_match.end], "123-45-6789"); } @@ -329,13 +333,13 @@ mod tests { let engine = default_engine(); let matches = engine.scan_text("She is American and speaks English."); assert!( - matches - .iter() - .any(|m| m.source == DetectionSource::Dictionary), + matches.iter().any(|m| m + .recognition_methods + .contains(&RecognitionMethod::Dictionary)), "expected dictionary match, got: {:?}", matches .iter() - .map(|m| (&m.pattern_name, &m.source)) + .map(|m| (&m.pattern_name, &m.recognition_methods)) .collect::>() ); } @@ -349,7 +353,9 @@ mod tests { .unwrap(); let matches = engine.scan_text("SSN: 123-45-6789"); assert!( - 
!matches.iter().any(|m| m.pattern_name == "ssn"), + !matches + .iter() + .any(|m| m.pattern_name.as_deref() == Some("ssn")), "allow-listed value should be suppressed" ); } @@ -358,7 +364,7 @@ mod tests { fn deny_list_injects_match() { let deny = DenyList::new().with( "secret-value-42", - EntityCategory::Pii, + EntityCategory::PersonalIdentity, EntityKind::PersonName, ); let engine = PatternEngine::builder() @@ -369,16 +375,24 @@ mod tests { let matches = engine.scan_text("The secret-value-42 should be detected."); let deny_match = matches .iter() - .find(|m| m.source == DetectionSource::DenyList) + .find(|m| m.pattern_name.is_none()) .expect("deny list value should be injected"); assert_eq!(deny_match.value, "secret-value-42"); assert_eq!(deny_match.confidence, 1.0); assert_eq!(deny_match.entity_kind, EntityKind::PersonName); + assert_eq!( + deny_match.recognition_methods, + vec![RecognitionMethod::Dictionary] + ); } #[test] fn deny_list_not_injected_when_absent() { - let deny = DenyList::new().with("not-in-text", EntityCategory::Pii, EntityKind::PersonName); + let deny = DenyList::new().with( + "not-in-text", + EntityCategory::PersonalIdentity, + EntityKind::PersonName, + ); let engine = PatternEngine::builder() .with_patterns(&["email"]) .with_deny(deny) @@ -386,9 +400,7 @@ mod tests { .unwrap(); let matches = engine.scan_text("Nothing special here."); assert!( - !matches - .iter() - .any(|m| m.source == DetectionSource::DenyList), + !matches.iter().any(|m| m.pattern_name.is_none()), "deny list value not in text should not be injected" ); } @@ -405,15 +417,19 @@ mod tests { #[test] fn deny_list_from_iterator() { let deny: DenyList = [ - ("secret", EntityCategory::Pii, EntityKind::PersonName), + ( + "secret", + EntityCategory::PersonalIdentity, + EntityKind::PersonName, + ), ("other", EntityCategory::Financial, EntityKind::PaymentCard), ] .into_iter() .collect(); assert_eq!(deny.len(), 2); assert!(deny.contains("secret")); - let entry = deny.get("other").unwrap(); 
- assert_eq!(entry.category, EntityCategory::Financial); + let rule = deny.get("other").unwrap(); + assert_eq!(rule.category, EntityCategory::Financial); } #[test] @@ -440,14 +456,41 @@ mod tests { .build() .unwrap(); let matches = engine.scan_text("SSN: 123-45-6789"); - let ssn_match = matches.iter().find(|m| m.pattern_name == "ssn").unwrap(); + let ssn_match = matches + .iter() + .find(|m| m.pattern_name.as_deref() == Some("ssn")) + .unwrap(); assert!( ssn_match.context.is_some(), - "SSN pattern should carry context rule through to PatternMatch" + "SSN pattern should carry context rule through to RawMatch" ); let ctx = ssn_match.context.as_ref().unwrap(); assert!(!ctx.keywords.is_empty()); assert!(ctx.window > 0); assert!(ctx.boost > 0.0); } + + #[test] + fn into_entity_builds_entity_without_location() { + let raw = RawMatch { + pattern_name: Some("ssn".into()), + category: EntityCategory::PersonalIdentity, + entity_kind: EntityKind::GovernmentId, + value: "123-45-6789".into(), + start: 5, + end: 16, + confidence: 0.9, + recognition_methods: vec![RecognitionMethod::Regex, RecognitionMethod::Checksum], + context: None, + }; + let entity = raw.into_entity(); + assert_eq!(entity.value, "123-45-6789"); + assert_eq!(entity.entity_kind, EntityKind::GovernmentId); + assert_eq!( + entity.recognition_methods, + vec![RecognitionMethod::Regex, RecognitionMethod::Checksum] + ); + assert!((entity.confidence - 0.9).abs() < f64::EPSILON); + assert!(entity.location.is_none()); + } } diff --git a/crates/nvisy-pattern/src/engine/pattern_match.rs b/crates/nvisy-pattern/src/engine/pattern_match.rs index a968d61..91c38ef 100644 --- a/crates/nvisy-pattern/src/engine/pattern_match.rs +++ b/crates/nvisy-pattern/src/engine/pattern_match.rs @@ -1,25 +1,15 @@ -//! [`PatternMatch`] and [`DetectionSource`] — output types from pattern scanning. +//! [`RawMatch`] — output type from pattern scanning. 
-use nvisy_ontology::entity::{EntityCategory, EntityKind}; +use nvisy_ontology::entity::{Entity, EntityCategory, EntityKind, RecognitionMethod}; use crate::patterns::ContextRule; -/// How the match was produced. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum DetectionSource { - /// Matched by a compiled regular expression. - Regex, - /// Matched by Aho-Corasick dictionary lookup. - Dictionary, - /// Injected by the deny list (known sensitive value). - DenyList, -} - /// A single match produced by [`PatternEngine::scan_text`](super::PatternEngine::scan_text). #[derive(Debug, Clone)] -pub struct PatternMatch { - /// Name of the pattern that produced this match. - pub pattern_name: String, +pub struct RawMatch { + /// Name of the pattern that produced this match, or `None` for + /// deny-list injected matches. + pub pattern_name: Option, /// Entity category of the match. pub category: EntityCategory, /// Entity kind of the match. @@ -32,8 +22,37 @@ pub struct PatternMatch { pub end: usize, /// Confidence score assigned by the pattern definition. pub confidence: f64, - /// How this match was produced (regex, dictionary, or deny list). - pub source: DetectionSource, + /// Recognition methods that produced this match, ordered by + /// application time (e.g. `[Regex, Checksum]` when a regex + /// match was confirmed by a validator). + pub recognition_methods: Vec, /// Optional context rule for span-level co-occurrence scoring. pub context: Option, } + +impl RawMatch { + /// Build an [`Entity`] from this match. + /// + /// The returned entity has no location or parent set: the caller + /// should attach those from the span context via + /// [`Entity::with_location`] and [`Entity::with_parent`]. + /// # Panics + /// + /// Panics if `recognition_methods` is empty. All engine-produced + /// matches always carry at least one method. 
+ pub fn into_entity(self) -> Entity { + assert!( + !self.recognition_methods.is_empty(), + "RawMatch::into_entity requires at least one recognition method" + ); + let mut entity = Entity::new( + self.category, + self.entity_kind, + self.value, + self.recognition_methods[0], + self.confidence, + ); + entity.recognition_methods = self.recognition_methods; + entity + } +} diff --git a/crates/nvisy-pattern/src/lib.rs b/crates/nvisy-pattern/src/lib.rs index 9f50527..fc1e04e 100644 --- a/crates/nvisy-pattern/src/lib.rs +++ b/crates/nvisy-pattern/src/lib.rs @@ -2,13 +2,14 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -pub(crate) mod dictionaries; +pub mod dictionaries; pub mod engine; pub(crate) mod patterns; pub(crate) mod validators; -pub use engine::{DetectionSource, PatternEngine, PatternEngineBuilder, PatternMatch}; -pub use patterns::ContextRule; +pub use self::dictionaries::{DictionaryLoadError, DictionaryRegistry}; +pub use self::engine::{PatternEngine, PatternEngineBuilder, RawMatch}; +pub use self::patterns::ContextRule; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-pattern/src/patterns/context_rule.rs b/crates/nvisy-pattern/src/patterns/context_rule.rs index ef0feb0..843c1ee 100644 --- a/crates/nvisy-pattern/src/patterns/context_rule.rs +++ b/crates/nvisy-pattern/src/patterns/context_rule.rs @@ -1,29 +1,59 @@ //! [`ContextRule`] — co-occurrence context for span-level confidence boosting. -use serde::{Deserialize, Serialize}; +use serde::Deserialize; /// Co-occurrence context rule for span-level confidence boosting. /// /// When a pattern match is found, nearby spans are searched for any of the /// `keywords`. If at least one keyword is present within `window` spans, /// the match confidence is increased by `boost` (clamped to `[0.0, 1.0]`). 
-#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Deserialize)] +#[serde(try_from = "RawContextRule")] pub struct ContextRule { /// Keywords to look for in nearby spans. pub keywords: Vec, /// Number of spans before and after the match span to search. - #[serde(default = "default_window")] pub window: usize, /// Confidence adjustment when at least one keyword is found. - #[serde(default = "default_boost")] + /// Must be in the range `[0.0, 1.0]`. pub boost: f64, /// Whether keyword matching is case-sensitive. /// /// Defaults to `false` (case-insensitive). - #[serde(default)] pub case_sensitive: bool, } +/// Serde intermediary that mirrors the JSON shape before validation. +#[derive(Deserialize)] +struct RawContextRule { + keywords: Vec, + #[serde(default = "default_window")] + window: usize, + #[serde(default = "default_boost")] + boost: f64, + #[serde(default)] + case_sensitive: bool, +} + +impl TryFrom for ContextRule { + type Error = String; + + fn try_from(raw: RawContextRule) -> Result { + if !(0.0..=1.0).contains(&raw.boost) { + return Err(format!( + "context rule boost must be in [0.0, 1.0], got {}", + raw.boost + )); + } + Ok(Self { + keywords: raw.keywords, + window: raw.window, + boost: raw.boost, + case_sensitive: raw.case_sensitive, + }) + } +} + fn default_window() -> usize { 3 } diff --git a/crates/nvisy-pattern/src/patterns/json_pattern.rs b/crates/nvisy-pattern/src/patterns/json_pattern.rs index 750e2ac..a7cfc0d 100644 --- a/crates/nvisy-pattern/src/patterns/json_pattern.rs +++ b/crates/nvisy-pattern/src/patterns/json_pattern.rs @@ -10,6 +10,7 @@ use serde::Deserialize; use super::context_rule::ContextRule; use super::pattern::{DictionaryPattern, MatchSource, Pattern, RegexPattern}; +use crate::validators::ValidatorResolver; /// Error returned when a JSON pattern file cannot be loaded. #[derive(Debug, thiserror::Error)] @@ -25,10 +26,6 @@ pub enum JsonPatternError { /// indicate misconfiguration (e.g. 
a typo in the validator name). #[derive(Debug)] pub enum JsonPatternWarning { - /// The `"category"` value was not a recognised variant and fell through - /// to [`EntityCategory::Custom`]. - UnknownCategory { pattern: String, slug: String }, - /// The `"validator"` name does not match any built-in validator, so /// the pattern will have no post-match validation. UnknownValidator { pattern: String, validator: String }, @@ -50,6 +47,10 @@ pub struct JsonPattern { impl JsonPattern { /// Deserialize and validate a pattern from raw JSON bytes. /// + /// `validators` is used to check whether a referenced validator name + /// is registered; unrecognised names produce a [`JsonPatternWarning`] + /// but do not prevent loading. + /// /// On success returns the pattern together with a (possibly empty) /// list of [`JsonPatternWarning`]s. /// @@ -60,6 +61,7 @@ impl JsonPattern { /// and `dictionary`). pub(crate) fn from_bytes( bytes: &[u8], + validators: &ValidatorResolver, ) -> Result<(Self, Vec), JsonPatternError> { /// Serde helper: exactly one of `pattern` or `dictionary`. #[derive(Deserialize)] @@ -91,19 +93,11 @@ impl JsonPattern { let mut warnings = Vec::new(); - if let EntityCategory::Custom(ref slug) = raw.category { - warnings.push(JsonPatternWarning::UnknownCategory { - pattern: raw.name.clone(), - slug: slug.clone(), - }); - } if let MatchSource::Regex(RegexPattern { validator: Some(ref v), .. 
}) = match_source - && crate::validators::ValidatorResolver::builtins() - .resolve(v) - .is_none() + && validators.resolve(v).is_none() { warnings.push(JsonPatternWarning::UnknownValidator { pattern: raw.name.clone(), @@ -128,8 +122,8 @@ impl Pattern for JsonPattern { &self.name } - fn category(&self) -> &EntityCategory { - &self.category + fn category(&self) -> EntityCategory { + self.category } fn entity_kind(&self) -> EntityKind { diff --git a/crates/nvisy-pattern/src/patterns/mod.rs b/crates/nvisy-pattern/src/patterns/mod.rs index 6085e4b..ff0e5db 100644 --- a/crates/nvisy-pattern/src/patterns/mod.rs +++ b/crates/nvisy-pattern/src/patterns/mod.rs @@ -20,10 +20,14 @@ mod pattern; use std::collections::BTreeMap; use std::sync::LazyLock; -pub use context_rule::ContextRule; use include_dir::{Dir, include_dir}; -pub use json_pattern::{JsonPattern, JsonPatternWarning}; -pub use pattern::{BoxPattern, DictionaryConfidence, MatchSource, Pattern}; + +pub use self::context_rule::ContextRule; +pub use self::json_pattern::{JsonPattern, JsonPatternWarning}; +pub use self::pattern::{BoxPattern, DictionaryConfidence, MatchSource, Pattern}; +use crate::validators::ValidatorResolver; + +const TARGET: &str = "nvisy_pattern::patterns"; /// A registry of named [`Pattern`] definitions with O(log n) lookup. /// @@ -65,10 +69,15 @@ impl PatternRegistry { self.inner.get(name).map(|b| b.as_ref()) } - /// All patterns in deterministic (alphabetical) order. - #[must_use] - pub fn values(&self) -> Vec<&dyn Pattern> { - self.inner.values().map(|b| b.as_ref()).collect() + /// Iterate over all registered patterns as `&dyn Pattern` in + /// deterministic (alphabetical) order. + pub fn iter(&self) -> impl Iterator { + self.inner.values().map(|b| b.as_ref()) + } + + /// Iterate over all registered pattern names. + pub fn names(&self) -> impl Iterator { + self.inner.keys().map(|s| s.as_str()) } /// Total number of registered patterns. 
@@ -77,14 +86,21 @@ impl PatternRegistry { self.inner.len() } + /// Whether the registry contains no patterns. + #[must_use] + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + /// Load all `.json` files from the embedded `assets/patterns/` /// directory and return a populated registry. /// /// Files that fail to parse are logged as warnings and skipped. - #[tracing::instrument(name = "patterns.load_builtins", fields(count))] + #[tracing::instrument(target = TARGET, name = "patterns.load_builtins", fields(count))] pub fn load_builtins() -> Self { static PATTERN_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/assets/patterns"); + let validators = ValidatorResolver::builtins(); let mut reg = Self::new(); for file in PATTERN_DIR.files() { @@ -92,16 +108,18 @@ impl PatternRegistry { let Some("json") = path.extension().and_then(|e| e.to_str()) else { tracing::warn!( + target: TARGET, path = %path.display(), "skipping non-JSON file in patterns directory", ); continue; }; - let (pattern, warnings) = match JsonPattern::from_bytes(file.contents()) { + let (pattern, warnings) = match JsonPattern::from_bytes(file.contents(), &validators) { Ok(pair) => pair, Err(e) => { tracing::warn!( + target: TARGET, path = %path.display(), error = %e, "failed to load pattern, skipping", @@ -112,16 +130,14 @@ impl PatternRegistry { for w in &warnings { match w { - JsonPatternWarning::UnknownCategory { pattern, slug } => { - tracing::warn!(%pattern, category = %slug, "unrecognised category falls through to Custom"); - } JsonPatternWarning::UnknownValidator { pattern, validator } => { - tracing::warn!(%pattern, %validator, "unknown validator name, pattern will have no post-match validation"); + tracing::warn!(target: TARGET, %pattern, %validator, "unknown validator name, pattern will have no post-match validation"); } } } tracing::trace!( + target: TARGET, name = %pattern.name(), category = %pattern.category(), entity_kind = %pattern.entity_kind(), @@ -132,7 +148,7 @@ impl 
PatternRegistry { } tracing::Span::current().record("count", reg.len()); - tracing::debug!("built-in patterns loaded"); + tracing::debug!(target: TARGET, "built-in patterns loaded"); reg } } @@ -161,12 +177,12 @@ mod tests { #[test] fn builtins_load() { - assert!(registry().len() > 0); + assert!(!registry().is_empty()); } #[test] fn pattern_names_are_sorted() { - let names: Vec<&str> = registry().values().iter().map(|p| p.name()).collect(); + let names: Vec<&str> = registry().names().collect(); let mut sorted = names.clone(); sorted.sort(); assert_eq!(names, sorted); @@ -174,15 +190,14 @@ mod tests { #[test] fn no_duplicate_pattern_names() { - let all = registry().values(); - let names: Vec<_> = all.iter().map(|p| p.name()).collect(); + let names: Vec<_> = registry().names().collect(); let unique: std::collections::HashSet<_> = names.iter().collect(); assert_eq!(names.len(), unique.len(), "duplicate pattern names found"); } #[test] fn all_patterns_have_valid_fields() { - for p in registry().values() { + for p in registry().iter() { assert!(!p.name().is_empty(), "pattern name is empty"); match p.match_source() { MatchSource::Regex(rp) => { @@ -202,10 +217,10 @@ mod tests { #[test] fn all_regex_patterns_compile() { - for p in registry().values() { + for p in registry().iter() { if let MatchSource::Regex(rp) = p.match_source() { assert!( - regex::Regex::new(&rp.regex).is_ok(), + regex::Regex::new(&rp.effective_regex()).is_ok(), "pattern {} failed to compile: {}", p.name(), rp.regex, @@ -217,7 +232,7 @@ mod tests { #[test] fn all_validators_resolve() { let resolver = crate::validators::ValidatorResolver::builtins(); - for p in registry().values() { + for p in registry().iter() { if let MatchSource::Regex(RegexPattern { validator: Some(name), .. 
@@ -234,13 +249,14 @@ mod tests { #[test] fn registry_insert_and_get() { + let validators = ValidatorResolver::builtins(); let json = br#"{ "name": "test", - "category": "pii", + "category": "personal_identity", "entity_type": "government_id", "pattern": { "regex": "\\d+", "confidence": 0.9 } }"#; - let (pattern, _warnings) = JsonPattern::from_bytes(json).unwrap(); + let (pattern, _warnings) = JsonPattern::from_bytes(json, &validators).unwrap(); let mut reg = PatternRegistry::new(); reg.insert(Box::new(pattern)); diff --git a/crates/nvisy-pattern/src/patterns/pattern.rs b/crates/nvisy-pattern/src/patterns/pattern.rs index 95e4b6f..4f1091c 100644 --- a/crates/nvisy-pattern/src/patterns/pattern.rs +++ b/crates/nvisy-pattern/src/patterns/pattern.rs @@ -22,9 +22,9 @@ pub struct RegexPattern { pub validator: Option, /// Whether the regex is case-sensitive. /// - /// Defaults to `false`. When `false`, the regex is compiled with - /// inline `(?i)` or equivalent flag. - #[serde(default)] + /// Defaults to `true`. When `false`, the regex is compiled with + /// an inline `(?i)` prefix. + #[serde(default = "default_case_sensitive")] pub case_sensitive: bool, /// Confidence score (0.0–1.0) assigned to matches from this pattern. /// @@ -33,6 +33,19 @@ pub struct RegexPattern { pub confidence: f64, } +impl RegexPattern { + /// Return the regex string ready for compilation. + /// + /// Prepends `(?i)` when [`case_sensitive`](Self::case_sensitive) is `false`. + pub fn effective_regex(&self) -> String { + if self.case_sensitive { + self.regex.clone() + } else { + format!("(?i){}", self.regex) + } + } +} + /// Confidence for a dictionary pattern: either a single uniform score /// or per-column scores for CSV dictionaries. #[derive(Debug, Clone, PartialEq)] @@ -126,6 +139,17 @@ pub enum MatchSource { Dictionary(DictionaryPattern), } +/// Default confidence score when `"confidence"` is omitted from JSON. 
+pub const DEFAULT_CONFIDENCE: f64 = 1.0; + +fn default_confidence() -> f64 { + DEFAULT_CONFIDENCE +} + +fn default_case_sensitive() -> bool { + true +} + /// A named detection pattern. /// /// Implementors describe a single entity type to detect, including how to @@ -137,19 +161,12 @@ pub enum MatchSource { /// from the JSON files under `assets/patterns/`. /// /// [`JsonPattern`]: super::JsonPattern -/// Default confidence score when `"confidence"` is omitted from JSON. -pub const DEFAULT_CONFIDENCE: f64 = 1.0; - -fn default_confidence() -> f64 { - DEFAULT_CONFIDENCE -} - pub trait Pattern: Send + Sync { /// Unique name identifying this pattern (e.g. `"ssn"`, `"credit-card"`). fn name(&self) -> &str; - /// High-level entity category (PII, Financial, Credentials, ...). - fn category(&self) -> &EntityCategory; + /// High-level entity category (PersonalIdentity, Financial, Credentials, ...). + fn category(&self) -> EntityCategory; /// Specific entity kind within the category (e.g. `GovernmentId`, `PaymentCard`). fn entity_kind(&self) -> EntityKind; diff --git a/crates/nvisy-pattern/src/prelude.rs b/crates/nvisy-pattern/src/prelude.rs index 4924718..3594e8f 100644 --- a/crates/nvisy-pattern/src/prelude.rs +++ b/crates/nvisy-pattern/src/prelude.rs @@ -4,4 +4,7 @@ //! use nvisy_pattern::prelude::*; //! ``` -pub use crate::{ContextRule, DetectionSource, PatternEngine, PatternEngineBuilder, PatternMatch}; +pub use crate::{ + ContextRule, DictionaryLoadError, DictionaryRegistry, PatternEngine, PatternEngineBuilder, + RawMatch, +}; diff --git a/crates/nvisy-pattern/src/validators/luhn.rs b/crates/nvisy-pattern/src/validators/luhn.rs index 8a1bd53..dd257fd 100644 --- a/crates/nvisy-pattern/src/validators/luhn.rs +++ b/crates/nvisy-pattern/src/validators/luhn.rs @@ -2,29 +2,47 @@ //! //! Implements the [Luhn algorithm](https://en.wikipedia.org/wiki/Luhn_algorithm) //! used to validate credit/debit card numbers and other identification -//! numbers. 
Non-digit characters (spaces, dashes) are stripped before -//! the check. +//! numbers. Only digits, spaces, and dashes are accepted as input: any +//! other character causes the check to fail. /// Return `true` if `num` passes the Luhn checksum. /// -/// All non-digit characters are ignored, so `"4539 1488 0343 6467"`, -/// `"4539-1488-0343-6467"`, and `"4539148803436467"` are equivalent. +/// Spaces and dashes are stripped before validation, so +/// `"4539 1488 0343 6467"`, `"4539-1488-0343-6467"`, and +/// `"4539148803436467"` are all equivalent. +/// +/// Returns `false` if the input is empty or contains characters other +/// than digits, spaces, and dashes. pub fn luhn_check(num: &str) -> bool { - let digits: String = num.chars().filter(|c| c.is_ascii_digit()).collect(); + if num.is_empty() { + return false; + } + + // Reject anything that isn't a digit, space, or dash. + if !num + .chars() + .all(|c| c.is_ascii_digit() || c == ' ' || c == '-') + { + return false; + } + + let digits: Vec = num.chars().filter_map(|c| c.to_digit(10)).collect(); + if digits.is_empty() { return false; } + let mut sum = 0u32; let mut alternate = false; - for ch in digits.chars().rev() { - let mut n = ch.to_digit(10).unwrap_or(0); + for &n in digits.iter().rev() { + let mut d = n; if alternate { - n *= 2; - if n > 9 { - n -= 9; + d *= 2; + if d > 9 { + d -= 9; } } - sum += n; + sum += d; alternate = !alternate; } sum.is_multiple_of(10) @@ -57,8 +75,20 @@ mod tests { assert!(!luhn_check("abcdef")); } + #[test] + fn mixed_alpha_digit_rejected() { + assert!(!luhn_check("45abc39")); + assert!(!luhn_check("4539 14X8 0343 6467")); + } + #[test] fn single_zero() { assert!(luhn_check("0")); } + + #[test] + fn only_separators_rejected() { + assert!(!luhn_check(" ")); + assert!(!luhn_check("---")); + } } diff --git a/crates/nvisy-pattern/src/validators/mod.rs b/crates/nvisy-pattern/src/validators/mod.rs index bb57cd1..cbac8a0 100644 --- a/crates/nvisy-pattern/src/validators/mod.rs +++ 
b/crates/nvisy-pattern/src/validators/mod.rs @@ -10,9 +10,9 @@ mod ssn; use std::collections::HashMap; -pub use iban::validate_iban; -pub use luhn::luhn_check; -pub use ssn::validate_ssn; +pub use self::iban::validate_iban; +pub use self::luhn::luhn_check; +pub use self::ssn::validate_ssn; /// Signature for a validation function: takes the matched text and returns /// `true` if the value is valid. From 6b79c39641436e0558bf3522720140a35a8e9083 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Thu, 12 Mar 2026 18:22:18 +0100 Subject: [PATCH 05/10] refactor(ontology): replace DetectionMethod with stage-specific method types in prelude re-exports Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-ontology/src/prelude.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/nvisy-ontology/src/prelude.rs b/crates/nvisy-ontology/src/prelude.rs index 1d2b442..92a1eab 100644 --- a/crates/nvisy-ontology/src/prelude.rs +++ b/crates/nvisy-ontology/src/prelude.rs @@ -2,7 +2,7 @@ pub use crate::context::{Context, ContextEntry, ContextEntryData}; pub use crate::entity::{ - Annotation, AnnotationKind, DetectionMethod, DetectionOutput, Entities, Entity, EntityCategory, - EntityKind, EntitySensitivity, Location, + Annotation, AnnotationKind, DetectionOutput, Entities, Entity, EntityCategory, EntityKind, + EntitySensitivity, ExtractionMethod, Location, RecognitionMethod, RefinementMethod, }; pub use crate::policy::{Policies, Policy, PolicyRule, Strategy}; From 29668a918cf76236d70935f21152b7fad90deb65 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Thu, 12 Mar 2026 18:22:36 +0100 Subject: [PATCH 06/10] style: enforce self:: prefix on internal module re-exports across workspace Use self:: prefix on all pub use re-exports in mod.rs and lib.rs files to visually distinguish internal submodule imports from external crate imports. Applied consistently across all workspace crates.
Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-cli/src/config/mod.rs | 5 ++-- crates/nvisy-cli/src/server/mod.rs | 2 +- crates/nvisy-codec/src/document/mod.rs | 4 ++-- crates/nvisy-codec/src/handler/audio/mod.rs | 16 ++++++------- crates/nvisy-codec/src/handler/image/mod.rs | 16 ++++++------- crates/nvisy-codec/src/handler/mod.rs | 9 +++---- crates/nvisy-codec/src/handler/rich/mod.rs | 8 +++---- crates/nvisy-codec/src/handler/text/mod.rs | 24 +++++++++---------- crates/nvisy-codec/src/lib.rs | 2 +- crates/nvisy-codec/src/transform/audio/mod.rs | 4 ++-- crates/nvisy-codec/src/transform/image/mod.rs | 4 ++-- crates/nvisy-codec/src/transform/mod.rs | 6 ++--- crates/nvisy-codec/src/transform/text/mod.rs | 4 ++-- crates/nvisy-core/src/content/mod.rs | 14 +++++------ crates/nvisy-core/src/lib.rs | 2 +- crates/nvisy-core/src/math/mod.rs | 8 +++---- crates/nvisy-core/src/media/mod.rs | 4 ++-- crates/nvisy-engine/src/compiler/graph/mod.rs | 6 ++--- crates/nvisy-engine/src/compiler/mod.rs | 11 +++++---- .../nvisy-engine/src/compiler/policy/mod.rs | 4 ++-- crates/nvisy-engine/src/lib.rs | 6 ++--- .../nvisy-engine/src/operation/context/mod.rs | 6 ++--- .../src/operation/envelope/mod.rs | 8 +++---- .../src/operation/inference/mod.rs | 16 ++++++------- .../src/operation/lifecycle/mod.rs | 14 +++++------ crates/nvisy-engine/src/operation/mod.rs | 5 ++-- .../src/operation/processing/mod.rs | 14 +++++------ crates/nvisy-engine/src/pipeline/mod.rs | 12 ++++++---- .../nvisy-engine/src/pipeline/policy/mod.rs | 4 ++-- .../nvisy-engine/src/provenance/action/mod.rs | 6 ++--- crates/nvisy-engine/src/provenance/mod.rs | 10 ++++---- .../nvisy-engine/src/provenance/record/mod.rs | 10 ++++---- crates/nvisy-http/src/lib.rs | 2 +- crates/nvisy-ocr/src/backend/mod.rs | 5 ++-- crates/nvisy-ocr/src/engine/mod.rs | 2 +- crates/nvisy-ocr/src/lib.rs | 4 ++-- .../src/provider/aws_textract/mod.rs | 4 ++-- .../nvisy-ocr/src/provider/azure_docai/mod.rs | 4 ++-- .../src/provider/datalab_surya/mod.rs | 4 
++-- .../src/provider/google_vision/mod.rs | 4 ++-- crates/nvisy-ocr/src/provider/mod.rs | 10 ++++---- .../src/provider/paddle_paddlex/mod.rs | 4 ++-- .../src/context/analytic/mod.rs | 5 ++-- .../src/context/biometric/mod.rs | 5 ++-- .../src/context/document/mod.rs | 5 ++-- .../src/context/geospatial/mod.rs | 7 +++--- crates/nvisy-ontology/src/context/mod.rs | 3 ++- .../src/context/reference/mod.rs | 9 +++---- .../src/context/temporal/mod.rs | 3 ++- .../nvisy-ontology/src/entity/location/mod.rs | 9 +++---- crates/nvisy-ontology/src/policy/mod.rs | 12 +++++----- .../nvisy-ontology/src/policy/strategy/mod.rs | 7 +++--- crates/nvisy-python/src/bridge/mod.rs | 3 ++- crates/nvisy-registry/src/lib.rs | 2 +- crates/nvisy-registry/src/store/mod.rs | 6 ++--- crates/nvisy-rig/src/agent/base/mod.rs | 16 ++++++------- crates/nvisy-rig/src/agent/cv/mod.rs | 6 ++--- crates/nvisy-rig/src/agent/generate/mod.rs | 4 ++-- crates/nvisy-rig/src/agent/mod.rs | 16 ++++++++----- crates/nvisy-rig/src/agent/ner/mod.rs | 6 ++--- crates/nvisy-rig/src/agent/ocr/input.rs | 2 +- crates/nvisy-rig/src/agent/ocr/mod.rs | 6 ++--- crates/nvisy-rig/src/agent/ocr/prompt.rs | 4 ++-- crates/nvisy-rig/src/audio/mod.rs | 4 ++-- crates/nvisy-rig/src/audio/stt/mod.rs | 4 ++-- crates/nvisy-rig/src/audio/tts/mod.rs | 4 ++-- crates/nvisy-rig/src/backend/mod.rs | 4 ++-- crates/nvisy-rig/src/backend/provider/mod.rs | 4 ++-- crates/nvisy-server/src/extract/mod.rs | 6 ++--- crates/nvisy-server/src/handler/error/mod.rs | 4 ++-- crates/nvisy-server/src/handler/mod.rs | 2 +- .../nvisy-server/src/handler/request/mod.rs | 8 +++---- .../nvisy-server/src/handler/response/mod.rs | 10 ++++---- crates/nvisy-server/src/lib.rs | 6 ++--- crates/nvisy-server/src/middleware/mod.rs | 10 ++++---- 75 files changed, 265 insertions(+), 244 deletions(-) diff --git a/crates/nvisy-cli/src/config/mod.rs b/crates/nvisy-cli/src/config/mod.rs index 32ae3a3..f09f45b 100644 --- a/crates/nvisy-cli/src/config/mod.rs +++ 
b/crates/nvisy-cli/src/config/mod.rs @@ -31,11 +31,12 @@ mod server; use std::path::PathBuf; use clap::Parser; -pub use file::MiddlewareSection; use nvisy_engine::RuntimeConfig; -pub use server::{ResolvedServer, ServerConfig}; use tracing_subscriber::EnvFilter; +pub use self::file::MiddlewareSection; +pub use self::server::{ResolvedServer, ServerConfig}; + /// Top-level CLI entry point. /// /// Parses command-line arguments and loads the TOML configuration file. diff --git a/crates/nvisy-cli/src/server/mod.rs b/crates/nvisy-cli/src/server/mod.rs index 8f1510f..7ae6dbc 100644 --- a/crates/nvisy-cli/src/server/mod.rs +++ b/crates/nvisy-cli/src/server/mod.rs @@ -3,4 +3,4 @@ mod listen; mod shutdown; -pub use listen::run; +pub use self::listen::run; diff --git a/crates/nvisy-codec/src/document/mod.rs b/crates/nvisy-codec/src/document/mod.rs index c8303de..fdfb5a7 100644 --- a/crates/nvisy-codec/src/document/mod.rs +++ b/crates/nvisy-codec/src/document/mod.rs @@ -9,9 +9,9 @@ use nvisy_core::content::{ContentData, ContentSource}; use nvisy_core::media::{ AudioFormat, DocumentType, ImageFormat, SpreadsheetFormat, TextFormat, WordFormat, }; -pub use span::Span; -pub use stream::SpanStream; +pub use self::span::Span; +pub use self::stream::SpanStream; use crate::handler::{ BoxedAudioHandler, BoxedImageHandler, BoxedRichHandler, BoxedTextHandler, CsvLoader, CsvParams, Handler, HtmlLoader, HtmlParams, JpegLoader, JpegParams, JsonLoader, JsonParams, Loader, diff --git a/crates/nvisy-codec/src/handler/audio/mod.rs b/crates/nvisy-codec/src/handler/audio/mod.rs index 8a3825e..0a58386 100644 --- a/crates/nvisy-codec/src/handler/audio/mod.rs +++ b/crates/nvisy-codec/src/handler/audio/mod.rs @@ -14,14 +14,14 @@ mod mp3_loader; mod wav_handler; mod wav_loader; -pub use audio_data::AudioData; -pub use audio_handler::BoxedAudioHandler; -use audio_handler_macro::impl_audio_handler; -pub use audio_span_id::AudioSpanId; -pub use mp3_handler::Mp3Handler; -pub use mp3_loader::{Mp3Loader, 
Mp3Params}; -pub use wav_handler::WavHandler; -pub use wav_loader::{WavLoader, WavParams}; +pub use self::audio_data::AudioData; +pub use self::audio_handler::BoxedAudioHandler; +use self::audio_handler_macro::impl_audio_handler; +pub use self::audio_span_id::AudioSpanId; +pub use self::mp3_handler::Mp3Handler; +pub use self::mp3_loader::{Mp3Loader, Mp3Params}; +pub use self::wav_handler::WavHandler; +pub use self::wav_loader::{WavLoader, WavParams}; /// Capability trait for handlers that expose audio content. /// diff --git a/crates/nvisy-codec/src/handler/image/mod.rs b/crates/nvisy-codec/src/handler/image/mod.rs index 8f1063f..140ade5 100644 --- a/crates/nvisy-codec/src/handler/image/mod.rs +++ b/crates/nvisy-codec/src/handler/image/mod.rs @@ -16,14 +16,14 @@ mod jpeg_loader; mod png_handler; mod png_loader; -pub use image_data::ImageData; -pub use image_handler::BoxedImageHandler; -pub(crate) use image_handler_macro::impl_image_handler; -pub use image_span_id::ImageSpanId; -pub use jpeg_handler::JpegHandler; -pub use jpeg_loader::{JpegLoader, JpegParams}; -pub use png_handler::PngHandler; -pub use png_loader::{PngLoader, PngParams}; +pub use self::image_data::ImageData; +pub use self::image_handler::BoxedImageHandler; +pub(crate) use self::image_handler_macro::impl_image_handler; +pub use self::image_span_id::ImageSpanId; +pub use self::jpeg_handler::JpegHandler; +pub use self::jpeg_loader::{JpegLoader, JpegParams}; +pub use self::png_handler::PngHandler; +pub use self::png_loader::{PngLoader, PngParams}; /// Capability trait for handlers that expose image content. 
/// diff --git a/crates/nvisy-codec/src/handler/mod.rs b/crates/nvisy-codec/src/handler/mod.rs index e9cba02..95de961 100644 --- a/crates/nvisy-codec/src/handler/mod.rs +++ b/crates/nvisy-codec/src/handler/mod.rs @@ -17,11 +17,12 @@ mod image; mod rich; mod text; -pub use audio::*; -pub use image::*; use nvisy_core::content::ContentSource; -pub use rich::*; -pub use text::*; + +pub use self::audio::*; +pub use self::image::*; +pub use self::rich::*; +pub use self::text::*; /// Base trait implemented by all format handlers. /// diff --git a/crates/nvisy-codec/src/handler/rich/mod.rs b/crates/nvisy-codec/src/handler/rich/mod.rs index aa1a7e0..a8aaede 100644 --- a/crates/nvisy-codec/src/handler/rich/mod.rs +++ b/crates/nvisy-codec/src/handler/rich/mod.rs @@ -13,9 +13,9 @@ mod docx_loader; mod rich_handler; #[cfg(feature = "docx")] -pub use docx_loader::{DocxLoader, DocxParams}; +pub use self::docx_loader::{DocxLoader, DocxParams}; #[cfg(feature = "pdf")] -pub use pdf_handler::{RichTextHandler, RichTextSpan}; +pub use self::pdf_handler::{RichTextHandler, RichTextSpan}; #[cfg(feature = "pdf")] -pub use pdf_loader::{PdfLoader, PdfParams}; -pub use rich_handler::BoxedRichHandler; +pub use self::pdf_loader::{PdfLoader, PdfParams}; +pub use self::rich_handler::BoxedRichHandler; diff --git a/crates/nvisy-codec/src/handler/text/mod.rs b/crates/nvisy-codec/src/handler/text/mod.rs index c9c0b3c..80e1693 100644 --- a/crates/nvisy-codec/src/handler/text/mod.rs +++ b/crates/nvisy-codec/src/handler/text/mod.rs @@ -25,22 +25,22 @@ mod xlsx_handler; #[cfg(feature = "xlsx")] mod xlsx_loader; -pub use csv_handler::{CsvData, CsvHandler, CsvSpan}; -pub use csv_loader::{CsvLoader, CsvParams}; +pub use self::csv_handler::{CsvData, CsvHandler, CsvSpan}; +pub use self::csv_loader::{CsvLoader, CsvParams}; #[cfg(feature = "html")] -pub use html_handler::{HtmlData, HtmlHandler, HtmlSpan}; +pub use self::html_handler::{HtmlData, HtmlHandler, HtmlSpan}; #[cfg(feature = "html")] -pub use 
html_loader::{HtmlLoader, HtmlParams}; -pub use json_handler::{JsonData, JsonHandler, JsonIndent, JsonPath}; -pub use json_loader::{JsonLoader, JsonParams}; -pub use text_data::TextData; -pub use text_handler::BoxedTextHandler; -pub use txt_handler::{TxtHandler, TxtSpan}; -pub use txt_loader::{TxtLoader, TxtParams}; +pub use self::html_loader::{HtmlLoader, HtmlParams}; +pub use self::json_handler::{JsonData, JsonHandler, JsonIndent, JsonPath}; +pub use self::json_loader::{JsonLoader, JsonParams}; +pub use self::text_data::TextData; +pub use self::text_handler::BoxedTextHandler; +pub use self::txt_handler::{TxtHandler, TxtSpan}; +pub use self::txt_loader::{TxtLoader, TxtParams}; #[cfg(feature = "xlsx")] -pub use xlsx_handler::XlsxHandler; +pub use self::xlsx_handler::XlsxHandler; #[cfg(feature = "xlsx")] -pub use xlsx_loader::{XlsxLoader, XlsxParams}; +pub use self::xlsx_loader::{XlsxLoader, XlsxParams}; /// Capability trait for handlers that expose text content. /// diff --git a/crates/nvisy-codec/src/lib.rs b/crates/nvisy-codec/src/lib.rs index 4bf9917..90f92dc 100644 --- a/crates/nvisy-codec/src/lib.rs +++ b/crates/nvisy-codec/src/lib.rs @@ -6,7 +6,7 @@ mod document; pub mod handler; pub mod transform; -pub use document::{Document, Span, SpanStream}; +pub use self::document::{Document, Span, SpanStream}; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-codec/src/transform/audio/mod.rs b/crates/nvisy-codec/src/transform/audio/mod.rs index d45e2a8..3c7d3a8 100644 --- a/crates/nvisy-codec/src/transform/audio/mod.rs +++ b/crates/nvisy-codec/src/transform/audio/mod.rs @@ -3,5 +3,5 @@ mod instruction; mod transform; -pub use instruction::{AudioOutput, AudioRedaction}; -pub use transform::AudioTransform; +pub use self::instruction::{AudioOutput, AudioRedaction}; +pub use self::transform::AudioTransform; diff --git a/crates/nvisy-codec/src/transform/image/mod.rs b/crates/nvisy-codec/src/transform/image/mod.rs index e40f412..8ff18a9 100644 --- 
a/crates/nvisy-codec/src/transform/image/mod.rs +++ b/crates/nvisy-codec/src/transform/image/mod.rs @@ -4,5 +4,5 @@ mod instruction; mod ops; mod transform; -pub use instruction::{ImageOutput, ImageRedaction}; -pub use transform::ImageTransform; +pub use self::instruction::{ImageOutput, ImageRedaction}; +pub use self::transform::ImageTransform; diff --git a/crates/nvisy-codec/src/transform/mod.rs b/crates/nvisy-codec/src/transform/mod.rs index 98f67a6..823d80a 100644 --- a/crates/nvisy-codec/src/transform/mod.rs +++ b/crates/nvisy-codec/src/transform/mod.rs @@ -4,6 +4,6 @@ mod audio; mod image; mod text; -pub use audio::{AudioOutput, AudioRedaction, AudioTransform}; -pub use image::{ImageOutput, ImageRedaction, ImageTransform}; -pub use text::{TextOutput, TextRedaction, TextTransform}; +pub use self::audio::{AudioOutput, AudioRedaction, AudioTransform}; +pub use self::image::{ImageOutput, ImageRedaction, ImageTransform}; +pub use self::text::{TextOutput, TextRedaction, TextTransform}; diff --git a/crates/nvisy-codec/src/transform/text/mod.rs b/crates/nvisy-codec/src/transform/text/mod.rs index 4235678..0993a6d 100644 --- a/crates/nvisy-codec/src/transform/text/mod.rs +++ b/crates/nvisy-codec/src/transform/text/mod.rs @@ -3,5 +3,5 @@ mod instruction; mod transform; -pub use instruction::{TextOutput, TextRedaction}; -pub use transform::TextTransform; +pub use self::instruction::{TextOutput, TextRedaction}; +pub use self::transform::TextTransform; diff --git a/crates/nvisy-core/src/content/mod.rs b/crates/nvisy-core/src/content/mod.rs index 7783432..26e4bb2 100644 --- a/crates/nvisy-core/src/content/mod.rs +++ b/crates/nvisy-core/src/content/mod.rs @@ -13,10 +13,10 @@ mod data_reference; mod encoding; mod source; -pub use bundle::Content; -pub use content_bytes::ContentBytes; -pub use content_data::ContentData; -pub use content_metadata::ContentMetadata; -pub use data_reference::DataReference; -pub use encoding::TextEncoding; -pub use source::ContentSource; +pub use 
self::bundle::Content; +pub use self::content_bytes::ContentBytes; +pub use self::content_data::ContentData; +pub use self::content_metadata::ContentMetadata; +pub use self::data_reference::DataReference; +pub use self::encoding::TextEncoding; +pub use self::source::ContentSource; diff --git a/crates/nvisy-core/src/lib.rs b/crates/nvisy-core/src/lib.rs index 8e0e634..ef2c913 100644 --- a/crates/nvisy-core/src/lib.rs +++ b/crates/nvisy-core/src/lib.rs @@ -7,7 +7,7 @@ pub mod math; pub mod media; mod error; -pub use error::{Error, ErrorKind, Result}; +pub use self::error::{Error, ErrorKind, Result}; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-core/src/math/mod.rs b/crates/nvisy-core/src/math/mod.rs index 0c67712..69fa209 100644 --- a/crates/nvisy-core/src/math/mod.rs +++ b/crates/nvisy-core/src/math/mod.rs @@ -8,7 +8,7 @@ mod dpi; mod polygon; mod time_span; -pub use bounding_box::{BoundingBox, BoundingBoxPixel}; -pub use dpi::Dpi; -pub use polygon::{Polygon, Vertex}; -pub use time_span::TimeSpan; +pub use self::bounding_box::{BoundingBox, BoundingBoxPixel}; +pub use self::dpi::Dpi; +pub use self::polygon::{Polygon, Vertex}; +pub use self::time_span::TimeSpan; diff --git a/crates/nvisy-core/src/media/mod.rs b/crates/nvisy-core/src/media/mod.rs index 68ba898..e6adb9f 100644 --- a/crates/nvisy-core/src/media/mod.rs +++ b/crates/nvisy-core/src/media/mod.rs @@ -6,8 +6,8 @@ mod content_kind; mod document_type; -pub use content_kind::ContentKind; -pub use document_type::{ +pub use self::content_kind::ContentKind; +pub use self::document_type::{ AudioFormat, DocumentType, ImageFormat, PresentationFormat, SpreadsheetFormat, TextFormat, WordFormat, }; diff --git a/crates/nvisy-engine/src/compiler/graph/mod.rs b/crates/nvisy-engine/src/compiler/graph/mod.rs index d8d5310..221de0b 100644 --- a/crates/nvisy-engine/src/compiler/graph/mod.rs +++ b/crates/nvisy-engine/src/compiler/graph/mod.rs @@ -11,15 +11,15 @@ mod target; use std::collections::HashSet; -pub use 
action::{ActionKind, ActionNode}; use nvisy_core::Error; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use source::SourceNode; -pub use target::TargetNode; use uuid::Uuid; use validator::Validate; +pub use self::action::{ActionKind, ActionNode}; +pub use self::source::SourceNode; +pub use self::target::TargetNode; use super::policy::{RetryPolicy, TimeoutPolicy}; /// A node in the pipeline graph. diff --git a/crates/nvisy-engine/src/compiler/mod.rs b/crates/nvisy-engine/src/compiler/mod.rs index 9ce2b6a..6ac7baa 100644 --- a/crates/nvisy-engine/src/compiler/mod.rs +++ b/crates/nvisy-engine/src/compiler/mod.rs @@ -10,16 +10,17 @@ mod policy; use std::collections::HashMap; -pub use graph::{ - ActionKind, ActionNode, Graph, GraphEdge, GraphNode, GraphNodeKind, SourceNode, TargetNode, -}; use nvisy_core::Error; use petgraph::algo::{is_cyclic_directed, toposort}; use petgraph::graph::{DiGraph, NodeIndex}; -pub(crate) use plan::{ExecutionPlan, ResolvedNode}; -pub use policy::{BackoffStrategy, RetryPolicy, TimeoutBehavior, TimeoutPolicy}; use uuid::Uuid; +pub use self::graph::{ + ActionKind, ActionNode, Graph, GraphEdge, GraphNode, GraphNodeKind, SourceNode, TargetNode, +}; +pub(crate) use self::plan::{ExecutionPlan, ResolvedNode}; +pub use self::policy::{BackoffStrategy, RetryPolicy, TimeoutBehavior, TimeoutPolicy}; + /// Pipeline compiler with optional default policies. 
/// /// Nodes that don't carry their own retry or timeout policy will inherit diff --git a/crates/nvisy-engine/src/compiler/policy/mod.rs b/crates/nvisy-engine/src/compiler/policy/mod.rs index 13adfb4..769e82f 100644 --- a/crates/nvisy-engine/src/compiler/policy/mod.rs +++ b/crates/nvisy-engine/src/compiler/policy/mod.rs @@ -3,5 +3,5 @@ mod retry; mod timeout; -pub use retry::{BackoffStrategy, RetryPolicy}; -pub use timeout::{TimeoutBehavior, TimeoutPolicy}; +pub use self::retry::{BackoffStrategy, RetryPolicy}; +pub use self::timeout::{TimeoutBehavior, TimeoutPolicy}; diff --git a/crates/nvisy-engine/src/lib.rs b/crates/nvisy-engine/src/lib.rs index cbe9293..0872713 100644 --- a/crates/nvisy-engine/src/lib.rs +++ b/crates/nvisy-engine/src/lib.rs @@ -8,11 +8,11 @@ pub mod pipeline; pub mod provenance; // Re-export graph data model for pipeline definitions. -pub use compiler::{ +pub use self::compiler::{ ActionKind, ActionNode, Graph, GraphEdge, GraphNode, GraphNodeKind, SourceNode, TargetNode, }; // Re-export retry and timeout policies for pipeline nodes. 
-pub use compiler::{BackoffStrategy, RetryPolicy, TimeoutBehavior, TimeoutPolicy}; -pub use pipeline::{ +pub use self::compiler::{BackoffStrategy, RetryPolicy, TimeoutBehavior, TimeoutPolicy}; +pub use self::pipeline::{ DefaultEngine, EngineSection, LlmSection, OcrSection, RuntimeConfig, SttSection, TtsSection, }; diff --git a/crates/nvisy-engine/src/operation/context/mod.rs b/crates/nvisy-engine/src/operation/context/mod.rs index 1b99b41..01cffa9 100644 --- a/crates/nvisy-engine/src/operation/context/mod.rs +++ b/crates/nvisy-engine/src/operation/context/mod.rs @@ -23,9 +23,9 @@ mod parallel; mod sequential; mod shared; -pub use parallel::ParallelContext; -pub use sequential::SequentialContext; -pub use shared::SharedContext; +pub use self::parallel::ParallelContext; +pub use self::sequential::SequentialContext; +pub use self::shared::SharedContext; pub(crate) mod private { pub trait Sealed {} diff --git a/crates/nvisy-engine/src/operation/envelope/mod.rs b/crates/nvisy-engine/src/operation/envelope/mod.rs index d19e33f..ff211db 100644 --- a/crates/nvisy-engine/src/operation/envelope/mod.rs +++ b/crates/nvisy-engine/src/operation/envelope/mod.rs @@ -27,13 +27,13 @@ mod audit; mod detection; mod policy; -pub use apply::ApplyPatch; -pub use audit::OperationEntry; -pub use detection::{DetectedEntities, RefinedEntities}; use nvisy_codec::Document; use nvisy_ontology::entity::Entities; -pub use policy::PolicyOutcome; +pub use self::apply::ApplyPatch; +pub use self::audit::OperationEntry; +pub use self::detection::{DetectedEntities, RefinedEntities}; +pub use self::policy::PolicyOutcome; use crate::provenance::Audit; /// Per-document state that flows through the entire pipeline. 
diff --git a/crates/nvisy-engine/src/operation/inference/mod.rs b/crates/nvisy-engine/src/operation/inference/mod.rs index 6891511..5a99c2c 100644 --- a/crates/nvisy-engine/src/operation/inference/mod.rs +++ b/crates/nvisy-engine/src/operation/inference/mod.rs @@ -27,11 +27,11 @@ mod summarization; mod transcription; mod translation; -pub use classification::Classification; -pub use computer_vision::ComputerVision; -pub use ner::{Ner, NerMethodParams}; -pub use ocr::Ocr; -pub use ocr_verification::{OcrVerification, OcrVerificationInput}; -pub use summarization::Summarization; -pub use transcription::Transcription; -pub use translation::Translation; +pub use self::classification::Classification; +pub use self::computer_vision::ComputerVision; +pub use self::ner::{Ner, NerMethodParams}; +pub use self::ocr::Ocr; +pub use self::ocr_verification::{OcrVerification, OcrVerificationInput}; +pub use self::summarization::Summarization; +pub use self::transcription::Transcription; +pub use self::translation::Translation; diff --git a/crates/nvisy-engine/src/operation/lifecycle/mod.rs b/crates/nvisy-engine/src/operation/lifecycle/mod.rs index d6b0bb4..65d17a0 100644 --- a/crates/nvisy-engine/src/operation/lifecycle/mod.rs +++ b/crates/nvisy-engine/src/operation/lifecycle/mod.rs @@ -24,10 +24,10 @@ mod encryption; mod export; mod import; -pub use compression::Compression; -pub use conversion::Conversion; -pub use decompression::Decompression; -pub use decryption::Decryption; -pub use encryption::Encryption; -pub use export::Export; -pub use import::Import; +pub use self::compression::Compression; +pub use self::conversion::Conversion; +pub use self::decompression::Decompression; +pub use self::decryption::Decryption; +pub use self::encryption::Encryption; +pub use self::export::Export; +pub use self::import::Import; diff --git a/crates/nvisy-engine/src/operation/mod.rs b/crates/nvisy-engine/src/operation/mod.rs index caa642c..14473fd 100644 --- 
a/crates/nvisy-engine/src/operation/mod.rs +++ b/crates/nvisy-engine/src/operation/mod.rs @@ -23,10 +23,11 @@ pub mod utility; use std::future::Future; -pub use context::{OperationContext, ParallelContext, SequentialContext, SharedContext}; -pub use envelope::DocumentEnvelope; use nvisy_core::Result; +pub use self::context::{OperationContext, ParallelContext, SequentialContext, SharedContext}; +pub use self::envelope::DocumentEnvelope; + /// A single unit of work in the redaction pipeline. /// /// Operations are stateless and composable. The engine calls [`Operation::call`] diff --git a/crates/nvisy-engine/src/operation/processing/mod.rs b/crates/nvisy-engine/src/operation/processing/mod.rs index 32eb85b..a3e9564 100644 --- a/crates/nvisy-engine/src/operation/processing/mod.rs +++ b/crates/nvisy-engine/src/operation/processing/mod.rs @@ -22,12 +22,12 @@ mod policy_evaluation; mod redaction; mod validation; -pub use deduplication::Deduplication; -pub use ensemble_fusion::{Ensemble, FusionStrategy}; -pub use manual_detection::{ +pub use self::deduplication::Deduplication; +pub use self::ensemble_fusion::{Ensemble, FusionStrategy}; +pub use self::manual_detection::{ Exclusion, ManualDetection, ManualDetectionParams, ManualOutput, is_excluded, }; -pub use pattern_match::{PatternDetectionParams, PatternInput, PatternMatch}; -pub use policy_evaluation::{EvaluatePolicy, EvaluatePolicyParams}; -pub use redaction::{Redaction, RedactionInput, RedactionOutput}; -pub use validation::Validation; +pub use self::pattern_match::{PatternDetectionParams, PatternMatch}; +pub use self::policy_evaluation::{EvaluatePolicy, EvaluatePolicyParams}; +pub use self::redaction::{Redaction, RedactionInput, RedactionOutput}; +pub use self::validation::Validation; diff --git a/crates/nvisy-engine/src/pipeline/mod.rs b/crates/nvisy-engine/src/pipeline/mod.rs index dd15995..350a098 100644 --- a/crates/nvisy-engine/src/pipeline/mod.rs +++ b/crates/nvisy-engine/src/pipeline/mod.rs @@ -16,17 +16,19 @@ 
mod runs; use std::future::Future; -pub use config::{EngineSection, LlmSection, OcrSection, RuntimeConfig, SttSection, TtsSection}; -pub use default::DefaultEngine; -pub use executor::{NodeOutput, RunOutput}; use nvisy_core::Error; use nvisy_ontology::context::Contexts; use nvisy_ontology::entity::DetectionOutput; use nvisy_ontology::policy::{Policies, RedactionSummary}; -pub use ontology::{Explainable, Explanation}; -pub use runs::{NodeProgress, RunManager, RunState, RunStatus, RunSummary}; use uuid::Uuid; +pub use self::config::{ + EngineSection, LlmSection, OcrSection, RuntimeConfig, SttSection, TtsSection, +}; +pub use self::default::DefaultEngine; +pub use self::executor::{NodeOutput, RunOutput}; +pub use self::ontology::{Explainable, Explanation}; +pub use self::runs::{NodeProgress, RunManager, RunState, RunStatus, RunSummary}; use crate::compiler::Graph; use crate::provenance::{Audit, PolicyEvaluation, RedactionMap}; diff --git a/crates/nvisy-engine/src/pipeline/policy/mod.rs b/crates/nvisy-engine/src/pipeline/policy/mod.rs index 5fd868a..a5b29ec 100644 --- a/crates/nvisy-engine/src/pipeline/policy/mod.rs +++ b/crates/nvisy-engine/src/pipeline/policy/mod.rs @@ -9,5 +9,5 @@ mod retry; mod timeout; -pub use retry::CompiledRetryPolicy; -pub use timeout::CompiledTimeoutPolicy; +pub use self::retry::CompiledRetryPolicy; +pub use self::timeout::CompiledTimeoutPolicy; diff --git a/crates/nvisy-engine/src/provenance/action/mod.rs b/crates/nvisy-engine/src/provenance/action/mod.rs index 3fac7e8..ffba31f 100644 --- a/crates/nvisy-engine/src/provenance/action/mod.rs +++ b/crates/nvisy-engine/src/provenance/action/mod.rs @@ -7,6 +7,6 @@ mod inference; mod lifecycle; mod processing; -pub use inference::{InferenceAction, InferenceActionBuilder}; -pub use lifecycle::{LifecycleAction, LifecycleActionBuilder}; -pub use processing::{ProcessingAction, ProcessingActionBuilder}; +pub use self::inference::{InferenceAction, InferenceActionBuilder}; +pub use 
self::lifecycle::{LifecycleAction, LifecycleActionBuilder}; +pub use self::processing::{ProcessingAction, ProcessingActionBuilder}; diff --git a/crates/nvisy-engine/src/provenance/mod.rs b/crates/nvisy-engine/src/provenance/mod.rs index ad54b5c..bc888b2 100644 --- a/crates/nvisy-engine/src/provenance/mod.rs +++ b/crates/nvisy-engine/src/provenance/mod.rs @@ -19,14 +19,14 @@ mod kind; mod action; mod record; -pub use action::{ +pub use self::action::{ InferenceAction, InferenceActionBuilder, LifecycleAction, LifecycleActionBuilder, ProcessingAction, ProcessingActionBuilder, }; -pub use audit::Audit; -pub use entry::{AuditEntry, AuditEntryBuilder, AuditEntryBuilderError, AuditEntryStatus}; -pub use kind::{AuditEntryKind, InferenceKind, LifecycleKind, ProcessingKind}; -pub use record::{ +pub use self::audit::Audit; +pub use self::entry::{AuditEntry, AuditEntryBuilder, AuditEntryBuilderError, AuditEntryStatus}; +pub use self::kind::{AuditEntryKind, InferenceKind, LifecycleKind, ProcessingKind}; +pub use self::record::{ PolicyEvaluation, RedactionDecision, RedactionMap, RedactionMapEntry, RedactionRecord, ReviewDecision, ReviewStatus, }; diff --git a/crates/nvisy-engine/src/provenance/record/mod.rs b/crates/nvisy-engine/src/provenance/record/mod.rs index 7d9cdac..e65bf95 100644 --- a/crates/nvisy-engine/src/provenance/record/mod.rs +++ b/crates/nvisy-engine/src/provenance/record/mod.rs @@ -7,8 +7,8 @@ mod map; mod redaction; mod review; -pub use decision::RedactionDecision; -pub use evaluation::PolicyEvaluation; -pub use map::{RedactionMap, RedactionMapEntry}; -pub use redaction::RedactionRecord; -pub use review::{ReviewDecision, ReviewStatus}; +pub use self::decision::RedactionDecision; +pub use self::evaluation::PolicyEvaluation; +pub use self::map::{RedactionMap, RedactionMapEntry}; +pub use self::redaction::RedactionRecord; +pub use self::review::{ReviewDecision, ReviewStatus}; diff --git a/crates/nvisy-http/src/lib.rs b/crates/nvisy-http/src/lib.rs index 
7ab3e9b..b6aaad6 100644 --- a/crates/nvisy-http/src/lib.rs +++ b/crates/nvisy-http/src/lib.rs @@ -5,7 +5,7 @@ mod client; mod middleware; -pub use client::{HttpClient, HttpConfig}; +pub use self::client::{HttpClient, HttpConfig}; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-ocr/src/backend/mod.rs b/crates/nvisy-ocr/src/backend/mod.rs index f3ad815..10adbcd 100644 --- a/crates/nvisy-ocr/src/backend/mod.rs +++ b/crates/nvisy-ocr/src/backend/mod.rs @@ -3,13 +3,14 @@ mod input; mod output; -pub use input::ImageInput; use nvisy_core::Error; pub use nvisy_core::media::ImageFormat; -pub use output::{Block, BlockKind, ImageOutput, Line, Page, Word}; use reqwest_middleware::reqwest::Response; use reqwest_middleware::reqwest::multipart::Part; +pub use self::input::ImageInput; +pub use self::output::{Block, BlockKind, ImageOutput, Line, Page, Word}; + /// Build a multipart [`Part`] from an [`ImageInput`]. pub(crate) fn image_part(image: &ImageInput) -> Result { let filename = format!("image.{}", image.format.extension()); diff --git a/crates/nvisy-ocr/src/engine/mod.rs b/crates/nvisy-ocr/src/engine/mod.rs index 6a6c802..7893219 100644 --- a/crates/nvisy-ocr/src/engine/mod.rs +++ b/crates/nvisy-ocr/src/engine/mod.rs @@ -6,9 +6,9 @@ use std::fmt; use std::sync::Arc; use nvisy_core::Error; -pub use params::OcrProvider; use tracing::instrument; +pub use self::params::OcrProvider; use crate::backend::{Backend, ImageInput, ImageOutput, RunParams}; /// Type-erased OCR engine wrapping any [`Backend`] implementation. 
diff --git a/crates/nvisy-ocr/src/lib.rs b/crates/nvisy-ocr/src/lib.rs index ae0dfba..e1078f7 100644 --- a/crates/nvisy-ocr/src/lib.rs +++ b/crates/nvisy-ocr/src/lib.rs @@ -9,7 +9,7 @@ pub mod provider; #[doc(hidden)] pub mod prelude; -pub use backend::{ +pub use self::backend::{ Backend, Block, BlockKind, ImageFormat, ImageInput, ImageOutput, Line, Page, RunParams, Word, }; -pub use engine::{OcrEngine, OcrProvider}; +pub use self::engine::{OcrEngine, OcrProvider}; diff --git a/crates/nvisy-ocr/src/provider/aws_textract/mod.rs b/crates/nvisy-ocr/src/provider/aws_textract/mod.rs index 26d9ca7..2bae938 100644 --- a/crates/nvisy-ocr/src/provider/aws_textract/mod.rs +++ b/crates/nvisy-ocr/src/provider/aws_textract/mod.rs @@ -6,5 +6,5 @@ mod backend; mod params; -pub use backend::AwsTextractBackend; -pub use params::AwsTextractParams; +pub use self::backend::AwsTextractBackend; +pub use self::params::AwsTextractParams; diff --git a/crates/nvisy-ocr/src/provider/azure_docai/mod.rs b/crates/nvisy-ocr/src/provider/azure_docai/mod.rs index 24f4622..5c09835 100644 --- a/crates/nvisy-ocr/src/provider/azure_docai/mod.rs +++ b/crates/nvisy-ocr/src/provider/azure_docai/mod.rs @@ -6,5 +6,5 @@ mod backend; mod params; -pub use backend::AzureDocaiBackend; -pub use params::AzureDocaiParams; +pub use self::backend::AzureDocaiBackend; +pub use self::params::AzureDocaiParams; diff --git a/crates/nvisy-ocr/src/provider/datalab_surya/mod.rs b/crates/nvisy-ocr/src/provider/datalab_surya/mod.rs index bb595da..793fe5c 100644 --- a/crates/nvisy-ocr/src/provider/datalab_surya/mod.rs +++ b/crates/nvisy-ocr/src/provider/datalab_surya/mod.rs @@ -6,5 +6,5 @@ mod backend; mod params; -pub use backend::SuryaBackend; -pub use params::SuryaParams; +pub use self::backend::SuryaBackend; +pub use self::params::SuryaParams; diff --git a/crates/nvisy-ocr/src/provider/google_vision/mod.rs b/crates/nvisy-ocr/src/provider/google_vision/mod.rs index 803dda9..e45502e 100644 --- 
a/crates/nvisy-ocr/src/provider/google_vision/mod.rs +++ b/crates/nvisy-ocr/src/provider/google_vision/mod.rs @@ -6,5 +6,5 @@ mod backend; mod params; -pub use backend::GoogleVisionBackend; -pub use params::GoogleVisionParams; +pub use self::backend::GoogleVisionBackend; +pub use self::params::GoogleVisionParams; diff --git a/crates/nvisy-ocr/src/provider/mod.rs b/crates/nvisy-ocr/src/provider/mod.rs index bf4b783..b4509b4 100644 --- a/crates/nvisy-ocr/src/provider/mod.rs +++ b/crates/nvisy-ocr/src/provider/mod.rs @@ -3,8 +3,8 @@ mod datalab_surya; mod paddle_paddlex; -pub use datalab_surya::{SuryaBackend, SuryaParams}; -pub use paddle_paddlex::{PaddleXBackend, PaddleXParams}; +pub use self::datalab_surya::{SuryaBackend, SuryaParams}; +pub use self::paddle_paddlex::{PaddleXBackend, PaddleXParams}; #[cfg(feature = "aws-textract")] #[cfg_attr(docsrs, doc(cfg(feature = "aws-textract")))] @@ -18,10 +18,10 @@ mod google_vision; #[cfg(feature = "aws-textract")] #[cfg_attr(docsrs, doc(cfg(feature = "aws-textract")))] -pub use aws_textract::{AwsTextractBackend, AwsTextractParams}; +pub use self::aws_textract::{AwsTextractBackend, AwsTextractParams}; #[cfg(feature = "azure-docai")] #[cfg_attr(docsrs, doc(cfg(feature = "azure-docai")))] -pub use azure_docai::{AzureDocaiBackend, AzureDocaiParams}; +pub use self::azure_docai::{AzureDocaiBackend, AzureDocaiParams}; #[cfg(feature = "google-vision")] #[cfg_attr(docsrs, doc(cfg(feature = "google-vision")))] -pub use google_vision::{GoogleVisionBackend, GoogleVisionParams}; +pub use self::google_vision::{GoogleVisionBackend, GoogleVisionParams}; diff --git a/crates/nvisy-ocr/src/provider/paddle_paddlex/mod.rs b/crates/nvisy-ocr/src/provider/paddle_paddlex/mod.rs index 5c02e82..7d6f28e 100644 --- a/crates/nvisy-ocr/src/provider/paddle_paddlex/mod.rs +++ b/crates/nvisy-ocr/src/provider/paddle_paddlex/mod.rs @@ -6,5 +6,5 @@ mod backend; mod params; -pub use backend::PaddleXBackend; -pub use params::PaddleXParams; +pub use 
self::backend::PaddleXBackend; +pub use self::params::PaddleXParams; diff --git a/crates/nvisy-ontology/src/context/analytic/mod.rs b/crates/nvisy-ontology/src/context/analytic/mod.rs index 841d9a9..5b2c7ae 100644 --- a/crates/nvisy-ontology/src/context/analytic/mod.rs +++ b/crates/nvisy-ontology/src/context/analytic/mod.rs @@ -3,11 +3,12 @@ mod embedding; mod pattern; -pub use embedding::EmbeddingData; -pub use pattern::PatternData; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +pub use self::embedding::EmbeddingData; +pub use self::pattern::PatternData; + /// Analytic computation variants. #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(tag = "kind", rename_all = "snake_case")] diff --git a/crates/nvisy-ontology/src/context/biometric/mod.rs b/crates/nvisy-ontology/src/context/biometric/mod.rs index 79f9dec..de235e5 100644 --- a/crates/nvisy-ontology/src/context/biometric/mod.rs +++ b/crates/nvisy-ontology/src/context/biometric/mod.rs @@ -3,10 +3,11 @@ mod face; mod voice; -pub use face::FaceData; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use voice::VoiceData; + +pub use self::face::FaceData; +pub use self::voice::VoiceData; /// Biometric identity verification variants. #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] diff --git a/crates/nvisy-ontology/src/context/document/mod.rs b/crates/nvisy-ontology/src/context/document/mod.rs index 736aaa5..b34d15e 100644 --- a/crates/nvisy-ontology/src/context/document/mod.rs +++ b/crates/nvisy-ontology/src/context/document/mod.rs @@ -5,8 +5,9 @@ mod template; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use signature::SignatureData; -pub use template::TemplateData; + +pub use self::signature::SignatureData; +pub use self::template::TemplateData; /// Document-related reference variants. 
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] diff --git a/crates/nvisy-ontology/src/context/geospatial/mod.rs b/crates/nvisy-ontology/src/context/geospatial/mod.rs index e342651..0e81d0b 100644 --- a/crates/nvisy-ontology/src/context/geospatial/mod.rs +++ b/crates/nvisy-ontology/src/context/geospatial/mod.rs @@ -4,12 +4,13 @@ mod address; mod coordinates; mod region; -pub use address::AddressData; -pub use coordinates::GeoCoordinate; -pub use region::{GeoBounds, GeoShape, RegionData}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +pub use self::address::AddressData; +pub use self::coordinates::GeoCoordinate; +pub use self::region::{GeoBounds, GeoShape, RegionData}; + /// Geospatial location variants. #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(tag = "kind", rename_all = "snake_case")] diff --git a/crates/nvisy-ontology/src/context/mod.rs b/crates/nvisy-ontology/src/context/mod.rs index e49872f..9c29315 100644 --- a/crates/nvisy-ontology/src/context/mod.rs +++ b/crates/nvisy-ontology/src/context/mod.rs @@ -12,11 +12,12 @@ pub mod geospatial; pub mod reference; pub mod temporal; -pub use entry::{ContextEntry, ContextEntryData}; use nvisy_core::content::ContentSource; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +pub use self::entry::{ContextEntry, ContextEntryData}; + /// A collection of [`Context`]s attached to a pipeline run. 
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)] pub struct Contexts { diff --git a/crates/nvisy-ontology/src/context/reference/mod.rs b/crates/nvisy-ontology/src/context/reference/mod.rs index d6ba016..e7b0be2 100644 --- a/crates/nvisy-ontology/src/context/reference/mod.rs +++ b/crates/nvisy-ontology/src/context/reference/mod.rs @@ -5,12 +5,13 @@ mod image; mod tag; mod text; -pub use credential::CredentialData; -pub use image::ImageData; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use tag::TagData; -pub use text::{TextData, TextEntry}; + +pub use self::credential::CredentialData; +pub use self::image::ImageData; +pub use self::tag::TagData; +pub use self::text::{TextData, TextEntry}; /// Direct comparison reference variants. #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] diff --git a/crates/nvisy-ontology/src/context/temporal/mod.rs b/crates/nvisy-ontology/src/context/temporal/mod.rs index 5fa8f72..7055d35 100644 --- a/crates/nvisy-ontology/src/context/temporal/mod.rs +++ b/crates/nvisy-ontology/src/context/temporal/mod.rs @@ -2,10 +2,11 @@ mod date; -pub use date::DateData; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +pub use self::date::DateData; + /// Temporal matching variants. 
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(tag = "kind", rename_all = "snake_case")] diff --git a/crates/nvisy-ontology/src/entity/location/mod.rs b/crates/nvisy-ontology/src/entity/location/mod.rs index 14a8b19..3bf5cfc 100644 --- a/crates/nvisy-ontology/src/entity/location/mod.rs +++ b/crates/nvisy-ontology/src/entity/location/mod.rs @@ -5,13 +5,14 @@ mod image; mod tabular; mod text; -pub use audio::AudioLocation; use derive_more::From; -pub use image::ImageLocation; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use tabular::TabularLocation; -pub use text::TextLocation; + +pub use self::audio::AudioLocation; +pub use self::image::ImageLocation; +pub use self::tabular::TabularLocation; +pub use self::text::TextLocation; /// A modality-specific location for a detected entity. /// diff --git a/crates/nvisy-ontology/src/policy/mod.rs b/crates/nvisy-ontology/src/policy/mod.rs index 39696ff..de53365 100644 --- a/crates/nvisy-ontology/src/policy/mod.rs +++ b/crates/nvisy-ontology/src/policy/mod.rs @@ -7,9 +7,9 @@ mod strategy; mod summary; mod types; -pub use retention::{Retention, RetentionPolicy, RetentionScope}; -pub use rule::{PolicyRule, RuleAction, RuleCondition}; -pub use selector::EntitySelector; -pub use strategy::{AudioStrategy, ImageStrategy, Strategy, TextStrategy}; -pub use summary::RedactionSummary; -pub use types::{Policies, Policy}; +pub use self::retention::{Retention, RetentionPolicy, RetentionScope}; +pub use self::rule::{PolicyRule, RuleAction, RuleCondition}; +pub use self::selector::EntitySelector; +pub use self::strategy::{AudioStrategy, ImageStrategy, Strategy, TextStrategy}; +pub use self::summary::RedactionSummary; +pub use self::types::{Policies, Policy}; diff --git a/crates/nvisy-ontology/src/policy/strategy/mod.rs b/crates/nvisy-ontology/src/policy/strategy/mod.rs index a43dd0e..a75968c 100644 --- a/crates/nvisy-ontology/src/policy/strategy/mod.rs +++ 
b/crates/nvisy-ontology/src/policy/strategy/mod.rs @@ -9,12 +9,13 @@ mod audio; mod image; mod text; -pub use audio::AudioStrategy; use derive_more::From; -pub use image::ImageStrategy; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use text::TextStrategy; + +pub use self::audio::AudioStrategy; +pub use self::image::ImageStrategy; +pub use self::text::TextStrategy; /// Unified redaction strategy across all modalities. /// diff --git a/crates/nvisy-python/src/bridge/mod.rs b/crates/nvisy-python/src/bridge/mod.rs index 9e9a614..cf3485d 100644 --- a/crates/nvisy-python/src/bridge/mod.rs +++ b/crates/nvisy-python/src/bridge/mod.rs @@ -6,13 +6,14 @@ mod error; -pub use error::from_pyerr; use hipstr::HipStr; use nvisy_core::Error; use pyo3::prelude::*; use pyo3::types::PyDict; use serde_json::Value; +pub use self::error::from_pyerr; + /// Lightweight handle to a Python NER module. /// /// The bridge does **not** hold the GIL or any Python objects; it simply diff --git a/crates/nvisy-registry/src/lib.rs b/crates/nvisy-registry/src/lib.rs index f8fa953..6e79dba 100644 --- a/crates/nvisy-registry/src/lib.rs +++ b/crates/nvisy-registry/src/lib.rs @@ -16,4 +16,4 @@ mod store; #[doc(hidden)] pub mod prelude; -pub use store::{ContentHandle, ContextHandle, Registry}; +pub use self::store::{ContentHandle, ContextHandle, Registry}; diff --git a/crates/nvisy-registry/src/store/mod.rs b/crates/nvisy-registry/src/store/mod.rs index 83c342e..b53d183 100644 --- a/crates/nvisy-registry/src/store/mod.rs +++ b/crates/nvisy-registry/src/store/mod.rs @@ -2,6 +2,6 @@ mod content; mod context; mod registry; -pub use content::ContentHandle; -pub use context::ContextHandle; -pub use registry::Registry; +pub use self::content::ContentHandle; +pub use self::context::ContextHandle; +pub use self::registry::Registry; diff --git a/crates/nvisy-rig/src/agent/base/mod.rs b/crates/nvisy-rig/src/agent/base/mod.rs index 00c1d81..97fba78 100644 --- 
a/crates/nvisy-rig/src/agent/base/mod.rs +++ b/crates/nvisy-rig/src/agent/base/mod.rs @@ -7,11 +7,11 @@ mod detection; mod provider; mod response; -pub use agent::AgentConfig; -pub(crate) use agent::{Agents, BaseAgent}; -pub(crate) use builder::BaseAgentBuilder; -pub use context::ContextWindow; -pub(crate) use detection::ALL_TYPES_HINT; -pub use detection::{DetectionConfig, DetectionRequest, DetectionResponse}; -pub use provider::AgentProvider; -pub(crate) use response::ResponseParser; +pub use self::agent::AgentConfig; +pub(crate) use self::agent::{Agents, BaseAgent}; +pub(crate) use self::builder::BaseAgentBuilder; +pub use self::context::ContextWindow; +pub(crate) use self::detection::ALL_TYPES_HINT; +pub use self::detection::{DetectionConfig, DetectionRequest, DetectionResponse}; +pub use self::provider::AgentProvider; +pub(crate) use self::response::ResponseParser; diff --git a/crates/nvisy-rig/src/agent/cv/mod.rs b/crates/nvisy-rig/src/agent/cv/mod.rs index 6bb0199..30cb2ec 100644 --- a/crates/nvisy-rig/src/agent/cv/mod.rs +++ b/crates/nvisy-rig/src/agent/cv/mod.rs @@ -12,12 +12,12 @@ mod tool; use async_trait::async_trait; use base64::Engine; use base64::engine::general_purpose::STANDARD; -pub use output::{CvEntities, CvEntity}; -use prompt::{CV_SYSTEM_PROMPT, CvPromptBuilder}; use serde::Serialize; -use tool::CvRigTool; use uuid::Uuid; +pub use self::output::{CvEntities, CvEntity}; +use self::prompt::{CV_SYSTEM_PROMPT, CvPromptBuilder}; +use self::tool::CvRigTool; use super::{AgentConfig, AgentProvider, BaseAgent, DetectionConfig}; use crate::backend::UsageTracker; use crate::error::Error; diff --git a/crates/nvisy-rig/src/agent/generate/mod.rs b/crates/nvisy-rig/src/agent/generate/mod.rs index 1c22a3a..6063687 100644 --- a/crates/nvisy-rig/src/agent/generate/mod.rs +++ b/crates/nvisy-rig/src/agent/generate/mod.rs @@ -8,10 +8,10 @@ mod output; mod prompt; use nvisy_ontology::entity::EntityKind; -pub use output::{GenOutput, GeneratedEntity}; -use 
prompt::{GEN_SYSTEM_PROMPT, GenPromptBuilder}; use uuid::Uuid; +pub use self::output::{GenOutput, GeneratedEntity}; +use self::prompt::{GEN_SYSTEM_PROMPT, GenPromptBuilder}; use super::{AgentConfig, AgentProvider, BaseAgent}; use crate::backend::UsageTracker; use crate::error::Error; diff --git a/crates/nvisy-rig/src/agent/mod.rs b/crates/nvisy-rig/src/agent/mod.rs index b90da59..ad04d73 100644 --- a/crates/nvisy-rig/src/agent/mod.rs +++ b/crates/nvisy-rig/src/agent/mod.rs @@ -11,11 +11,15 @@ mod generate; mod ner; mod ocr; -pub(crate) use base::{ALL_TYPES_HINT, BaseAgent}; -pub use base::{ +pub(crate) use self::base::{ALL_TYPES_HINT, BaseAgent}; +pub use self::base::{ AgentConfig, AgentProvider, ContextWindow, DetectionConfig, DetectionRequest, DetectionResponse, }; -pub use cv::{CvAgent, CvDetection, CvEntities, CvEntity, CvProvider}; -pub use generate::{GenAgent, GenOutput, GenRequest, GeneratedEntity}; -pub use ner::{KnownNerEntity, NerAgent, NerContext, NerEntities, NerEntity, ResolvedOffsets}; -pub use ocr::{OcrAgent, ProposedEntity, VerificationOutput, VerificationStatus, VerifiedEntity}; +pub use self::cv::{CvAgent, CvDetection, CvEntities, CvEntity, CvProvider}; +pub use self::generate::{GenAgent, GenOutput, GenRequest, GeneratedEntity}; +pub use self::ner::{ + KnownNerEntity, NerAgent, NerContext, NerEntities, NerEntity, ResolvedOffsets, +}; +pub use self::ocr::{ + OcrAgent, ProposedEntity, VerificationOutput, VerificationStatus, VerifiedEntity, +}; diff --git a/crates/nvisy-rig/src/agent/ner/mod.rs b/crates/nvisy-rig/src/agent/ner/mod.rs index a1627b6..45dab21 100644 --- a/crates/nvisy-rig/src/agent/ner/mod.rs +++ b/crates/nvisy-rig/src/agent/ner/mod.rs @@ -8,12 +8,12 @@ mod context; mod output; mod prompt; -pub use context::NerContext; use nvisy_http::HttpClient; -pub use output::{KnownNerEntity, NerEntities, NerEntity, ResolvedOffsets}; -use prompt::{NER_SYSTEM_PROMPT, NerPromptBuilder}; use uuid::Uuid; +pub use self::context::NerContext; +pub use 
self::output::{KnownNerEntity, NerEntities, NerEntity, ResolvedOffsets}; +use self::prompt::{NER_SYSTEM_PROMPT, NerPromptBuilder}; use super::{AgentConfig, AgentProvider, BaseAgent, DetectionConfig}; use crate::backend::UsageTracker; use crate::error::Error; diff --git a/crates/nvisy-rig/src/agent/ocr/input.rs b/crates/nvisy-rig/src/agent/ocr/input.rs index 89b2c19..9308ce1 100644 --- a/crates/nvisy-rig/src/agent/ocr/input.rs +++ b/crates/nvisy-rig/src/agent/ocr/input.rs @@ -31,7 +31,7 @@ impl ProposedEntity { }; Self { id, - category: entity.category.clone(), + category: entity.category, entity_type: entity.entity_kind, value: entity.value.clone(), confidence: entity.confidence, diff --git a/crates/nvisy-rig/src/agent/ocr/mod.rs b/crates/nvisy-rig/src/agent/ocr/mod.rs index 9078c30..6f46a8b 100644 --- a/crates/nvisy-rig/src/agent/ocr/mod.rs +++ b/crates/nvisy-rig/src/agent/ocr/mod.rs @@ -11,12 +11,12 @@ mod prompt; use base64::Engine; use base64::engine::general_purpose::STANDARD; -pub use input::ProposedEntity; use nvisy_ontology::entity::Entity; -pub use output::{VerificationOutput, VerificationStatus, VerifiedEntity}; -use prompt::{OCR_SYSTEM_PROMPT, OcrPromptBuilder}; use uuid::Uuid; +pub use self::input::ProposedEntity; +pub use self::output::{VerificationOutput, VerificationStatus, VerifiedEntity}; +use self::prompt::{OCR_SYSTEM_PROMPT, OcrPromptBuilder}; use super::{AgentConfig, AgentProvider, BaseAgent}; use crate::backend::UsageTracker; use crate::error::Error; diff --git a/crates/nvisy-rig/src/agent/ocr/prompt.rs b/crates/nvisy-rig/src/agent/ocr/prompt.rs index 75d3b2f..b61581c 100644 --- a/crates/nvisy-rig/src/agent/ocr/prompt.rs +++ b/crates/nvisy-rig/src/agent/ocr/prompt.rs @@ -78,7 +78,7 @@ mod tests { let entities = vec![ ProposedEntity { id: 0, - category: EntityCategory::Pii, + category: EntityCategory::PersonalIdentity, entity_type: EntityKind::PersonName, value: "John Doe".into(), confidence: 0.95, @@ -100,7 +100,7 @@ mod tests { ]; let prompt = 
OcrPromptBuilder::new(&entities).build("AAAA"); - assert!(prompt.contains("[0] category=pii")); + assert!(prompt.contains("[0] category=personal_identity")); assert!(prompt.contains("person_name")); assert!(prompt.contains("John Doe")); assert!(prompt.contains("bbox=[10.0, 20.0, 100.0, 30.0]")); diff --git a/crates/nvisy-rig/src/audio/mod.rs b/crates/nvisy-rig/src/audio/mod.rs index d76f5cf..c18f088 100644 --- a/crates/nvisy-rig/src/audio/mod.rs +++ b/crates/nvisy-rig/src/audio/mod.rs @@ -3,5 +3,5 @@ pub mod stt; pub mod tts; -pub use stt::SttProvider; -pub use tts::TtsProvider; +pub use self::stt::SttProvider; +pub use self::tts::TtsProvider; diff --git a/crates/nvisy-rig/src/audio/stt/mod.rs b/crates/nvisy-rig/src/audio/stt/mod.rs index 1995864..4c7d3f4 100644 --- a/crates/nvisy-rig/src/audio/stt/mod.rs +++ b/crates/nvisy-rig/src/audio/stt/mod.rs @@ -7,12 +7,12 @@ mod provider; use nvisy_http::HttpClient; -pub(crate) use provider::SttModels; -pub use provider::SttProvider; #[cfg(feature = "openai-whisper")] use rig::transcription::TranscriptionModel; use uuid::Uuid; +pub(crate) use self::provider::SttModels; +pub use self::provider::SttProvider; use crate::error::Error; const TARGET: &str = "nvisy_rig::stt"; diff --git a/crates/nvisy-rig/src/audio/tts/mod.rs b/crates/nvisy-rig/src/audio/tts/mod.rs index d3e5878..da84a80 100644 --- a/crates/nvisy-rig/src/audio/tts/mod.rs +++ b/crates/nvisy-rig/src/audio/tts/mod.rs @@ -3,12 +3,12 @@ mod provider; use nvisy_http::HttpClient; -pub(crate) use provider::TtsModels; -pub use provider::TtsProvider; #[cfg(feature = "openai-tts")] use rig::audio_generation::AudioGenerationModel as _; use uuid::Uuid; +pub(crate) use self::provider::TtsModels; +pub use self::provider::TtsProvider; use crate::error::Error; const TARGET: &str = "nvisy_rig::tts"; diff --git a/crates/nvisy-rig/src/backend/mod.rs b/crates/nvisy-rig/src/backend/mod.rs index d65dee0..525d27f 100644 --- a/crates/nvisy-rig/src/backend/mod.rs +++ 
b/crates/nvisy-rig/src/backend/mod.rs @@ -2,5 +2,5 @@ mod metrics; mod provider; -pub use metrics::{UsageStats, UsageTracker}; -pub use provider::{AuthenticatedProvider, UnauthenticatedProvider}; +pub use self::metrics::{UsageStats, UsageTracker}; +pub use self::provider::{AuthenticatedProvider, UnauthenticatedProvider}; diff --git a/crates/nvisy-rig/src/backend/provider/mod.rs b/crates/nvisy-rig/src/backend/provider/mod.rs index 3d89231..beaff0e 100644 --- a/crates/nvisy-rig/src/backend/provider/mod.rs +++ b/crates/nvisy-rig/src/backend/provider/mod.rs @@ -6,5 +6,5 @@ mod authenticated; mod unauthenticated; -pub use authenticated::AuthenticatedProvider; -pub use unauthenticated::UnauthenticatedProvider; +pub use self::authenticated::AuthenticatedProvider; +pub use self::unauthenticated::UnauthenticatedProvider; diff --git a/crates/nvisy-server/src/extract/mod.rs b/crates/nvisy-server/src/extract/mod.rs index 7f8faea..bc696e5 100644 --- a/crates/nvisy-server/src/extract/mod.rs +++ b/crates/nvisy-server/src/extract/mod.rs @@ -4,6 +4,6 @@ mod json; mod path; mod version; -pub use json::Json; -pub use path::Path; -pub use version::Version; +pub use self::json::Json; +pub use self::path::Path; +pub use self::version::Version; diff --git a/crates/nvisy-server/src/handler/error/mod.rs b/crates/nvisy-server/src/handler/error/mod.rs index 7d733be..0687ada 100644 --- a/crates/nvisy-server/src/handler/error/mod.rs +++ b/crates/nvisy-server/src/handler/error/mod.rs @@ -8,5 +8,5 @@ mod from_core; mod http_error; mod http_kind; -pub use http_error::{Error, Result}; -pub use http_kind::ErrorKind; +pub use self::http_error::{Error, Result}; +pub use self::http_kind::ErrorKind; diff --git a/crates/nvisy-server/src/handler/mod.rs b/crates/nvisy-server/src/handler/mod.rs index 8cb5cce..2a7cc76 100644 --- a/crates/nvisy-server/src/handler/mod.rs +++ b/crates/nvisy-server/src/handler/mod.rs @@ -21,8 +21,8 @@ mod request; mod response; use aide::axum::ApiRouter; -pub use error::{Error, 
ErrorKind, Result}; +pub use self::error::{Error, ErrorKind, Result}; use crate::service::ServiceState; /// Build the handler route tree. diff --git a/crates/nvisy-server/src/handler/request/mod.rs b/crates/nvisy-server/src/handler/request/mod.rs index 9a60105..d01612b 100644 --- a/crates/nvisy-server/src/handler/request/mod.rs +++ b/crates/nvisy-server/src/handler/request/mod.rs @@ -9,7 +9,7 @@ mod files; mod path; mod process; -pub use contexts::NewContext; -pub use files::NewFile; -pub use path::{ActorQuery, ContentPath, ContextPath}; -pub use process::NewProcess; +pub use self::contexts::NewContext; +pub use self::files::NewFile; +pub use self::path::{ActorQuery, ContentPath, ContextPath}; +pub use self::process::NewProcess; diff --git a/crates/nvisy-server/src/handler/response/mod.rs b/crates/nvisy-server/src/handler/response/mod.rs index b2f7fa9..6c57c58 100644 --- a/crates/nvisy-server/src/handler/response/mod.rs +++ b/crates/nvisy-server/src/handler/response/mod.rs @@ -11,8 +11,8 @@ mod error; mod files; mod process; -pub use check::{Analytics, Health, ServiceStatus}; -pub use contexts::{Context, ContextId, ContextList}; -pub use error::ErrorResponse; -pub use files::{File, FileId, FileList}; -pub use process::ProcessResult; +pub use self::check::{Analytics, Health, ServiceStatus}; +pub use self::contexts::{Context, ContextId, ContextList}; +pub use self::error::ErrorResponse; +pub use self::files::{File, FileId, FileList}; +pub use self::process::ProcessResult; diff --git a/crates/nvisy-server/src/lib.rs b/crates/nvisy-server/src/lib.rs index 8c92dde..8d6a424 100644 --- a/crates/nvisy-server/src/lib.rs +++ b/crates/nvisy-server/src/lib.rs @@ -7,6 +7,6 @@ pub mod handler; pub mod middleware; pub mod service; -pub use handler::error::{Error, ErrorKind, Result}; -pub use handler::routes; -pub use service::ServiceState; +pub use self::handler::error::{Error, ErrorKind, Result}; +pub use self::handler::routes; +pub use self::service::ServiceState; diff --git 
a/crates/nvisy-server/src/middleware/mod.rs b/crates/nvisy-server/src/middleware/mod.rs index c4f1e18..eb25d59 100644 --- a/crates/nvisy-server/src/middleware/mod.rs +++ b/crates/nvisy-server/src/middleware/mod.rs @@ -47,10 +47,10 @@ mod recovery; mod security; mod specification; -pub use constants::{ +pub use self::constants::{ DEFAULT_MAX_BODY_SIZE, DEFAULT_MAX_FILE_BODY_SIZE, DEFAULT_REQUEST_TIMEOUT_SECS, }; -pub use observability::RouterObservabilityExt; -pub use recovery::{RecoveryConfig, RouterRecoveryExt}; -pub use security::{RouterSecurityExt, SecurityConfig}; -pub use specification::{OpenApiConfig, RouterOpenApiExt}; +pub use self::observability::RouterObservabilityExt; +pub use self::recovery::{RecoveryConfig, RouterRecoveryExt}; +pub use self::security::{RouterSecurityExt, SecurityConfig}; +pub use self::specification::{OpenApiConfig, RouterOpenApiExt}; From 262a839270ab128b1178e12ac96f203e63deeb22 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Thu, 12 Mar 2026 20:31:07 +0100 Subject: [PATCH 07/10] refactor(pattern): extract ScanContext, simplify DenyList API, and polish docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move allow/deny lists from PatternEngine into per-scan ScanContext, simplify DenyList::with/insert to take a DenyRule struct instead of separate fields, remove FromIterator for DenyList, fold builder back into its own file after slimming, add scan_context.rs. Extract registries and error types into dedicated files across dictionaries and patterns modules. Rename Error.source_component to component. Normalize all doc comments to prefer : over — and single spaces. 
Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-core/src/error.rs | 6 +- .../src/operation/processing/pattern_match.rs | 6 +- crates/nvisy-pattern/README.md | 42 +- .../src/dictionaries/csv_dictionary.rs | 74 ++-- .../src/dictionaries/csv_error.rs | 19 + .../src/dictionaries/dictionary.rs | 32 +- .../src/dictionaries/dictionary_error.rs | 42 ++ .../src/dictionaries/dictionary_registry.rs | 326 ++++++++++++++ crates/nvisy-pattern/src/dictionaries/mod.rs | 364 +--------------- .../src/dictionaries/text_dictionary.rs | 27 +- crates/nvisy-pattern/src/engine/allow_list.rs | 2 +- crates/nvisy-pattern/src/engine/builder.rs | 48 +-- crates/nvisy-pattern/src/engine/deny_list.rs | 78 +--- crates/nvisy-pattern/src/engine/error.rs | 16 +- crates/nvisy-pattern/src/engine/mod.rs | 172 ++++---- .../nvisy-pattern/src/engine/pattern_match.rs | 7 +- .../nvisy-pattern/src/engine/scan_context.rs | 50 +++ crates/nvisy-pattern/src/lib.rs | 10 +- .../src/patterns/context_rule.rs | 8 +- .../src/patterns/json_pattern.rs | 14 +- crates/nvisy-pattern/src/patterns/mod.rs | 253 +---------- crates/nvisy-pattern/src/patterns/pattern.rs | 20 +- .../src/patterns/pattern_error.rs | 42 ++ .../src/patterns/pattern_registry.rs | 405 ++++++++++++++++++ crates/nvisy-pattern/src/prelude.rs | 3 +- crates/nvisy-pattern/src/validators/luhn.rs | 2 +- crates/nvisy-pattern/src/validators/mod.rs | 15 +- .../src/handler/error/from_core.rs | 2 +- 28 files changed, 1168 insertions(+), 917 deletions(-) create mode 100644 crates/nvisy-pattern/src/dictionaries/csv_error.rs create mode 100644 crates/nvisy-pattern/src/dictionaries/dictionary_error.rs create mode 100644 crates/nvisy-pattern/src/dictionaries/dictionary_registry.rs create mode 100644 crates/nvisy-pattern/src/engine/scan_context.rs create mode 100644 crates/nvisy-pattern/src/patterns/pattern_error.rs create mode 100644 crates/nvisy-pattern/src/patterns/pattern_registry.rs diff --git a/crates/nvisy-core/src/error.rs b/crates/nvisy-core/src/error.rs index 
063cd42..1a50448 100644 --- a/crates/nvisy-core/src/error.rs +++ b/crates/nvisy-core/src/error.rs @@ -44,7 +44,7 @@ pub struct Error { /// Human-readable description of what went wrong. pub message: String, /// Name of the component that produced this error (e.g. `"s3-read"`, `"detect-regex"`). - pub source_component: Option, + pub component: Option, /// Whether the operation that failed can be safely retried. pub retryable: bool, /// The underlying cause, if any. @@ -58,7 +58,7 @@ impl Error { Self { kind, message: message.into(), - source_component: None, + component: None, retryable: false, source: None, } @@ -72,7 +72,7 @@ impl Error { /// Tag this error with the name of the component that produced it. pub fn with_component(mut self, component: impl Into) -> Self { - self.source_component = Some(component.into()); + self.component = Some(component.into()); self } diff --git a/crates/nvisy-engine/src/operation/processing/pattern_match.rs b/crates/nvisy-engine/src/operation/processing/pattern_match.rs index 4448073..d41d745 100644 --- a/crates/nvisy-engine/src/operation/processing/pattern_match.rs +++ b/crates/nvisy-engine/src/operation/processing/pattern_match.rs @@ -7,7 +7,8 @@ use nvisy_codec::Span; use nvisy_codec::handler::TextData; use nvisy_core::{Error, Result}; use nvisy_ontology::entity::TextLocation; -use nvisy_pattern::{ContextRule, PatternEngine, PatternEngineBuilder, RawMatch}; +use nvisy_pattern::patterns::ContextRule; +use nvisy_pattern::{PatternEngine, PatternEngineBuilder, RawMatch, ScanContext}; use serde::Deserialize; use crate::operation::envelope::DetectedEntities; @@ -55,9 +56,10 @@ impl PatternMatch { let span_data: Vec<&str> = spans.iter().map(|s| s.data.as_str()).collect(); let mut raw_matches: Vec<(usize, RawMatch)> = Vec::new(); + let scan_ctx = ScanContext::default(); for (idx, span) in spans.iter().enumerate() { - for m in self.engine.scan_text(span.data.as_str()) { + for m in self.engine.scan_text(span.data.as_str(), &scan_ctx) { 
raw_matches.push((idx, m)); } } diff --git a/crates/nvisy-pattern/README.md b/crates/nvisy-pattern/README.md index 6da6ebb..139c173 100644 --- a/crates/nvisy-pattern/README.md +++ b/crates/nvisy-pattern/README.md @@ -56,15 +56,15 @@ Patterns are JSON definition files embedded at compile time from | Field | Type | Default | Description | |-------|------|---------|-------------| -| `regex` | string | — | Regular expression string | -| `validator` | string | — | Post-match validator name resolved via `ValidatorResolver` | +| `regex` | string | required | Regular expression string | +| `validator` | string | none | Post-match validator name resolved via `ValidatorResolver` | | `case_sensitive` | bool | `false` | Whether matching is case-sensitive | ### `dictionary` object (dictionary match source) | Field | Type | Default | Description | |-------|------|---------|-------------| -| `name` | string | — | Named dictionary from `DictionaryRegistry` | +| `name` | string | required | Named dictionary from `DictionaryRegistry` | | `case_sensitive` | bool | `false` | Whether matching is case-sensitive | ### Context rule (co-occurrence scoring) @@ -76,7 +76,7 @@ increased by `boost`, clamped to `[0.0, 1.0]`. | Field | Type | Default | Description | |-------|------|---------|-------------| -| `keywords` | string[] | — | Strings to search for in nearby spans | +| `keywords` | string[] | required | Strings to search for in nearby spans | | `window` | int | `3` | Number of spans before/after the match to examine | | `boost` | float | `0.1` | Confidence increase when a keyword is found | | `case_sensitive` | bool | `false` | Whether keyword matching is case-sensitive | @@ -88,21 +88,25 @@ adjacent spans. 
## Allow/deny lists -The `PatternEngineBuilder` supports exact-match allow and deny lists via the -[`AllowList`] and [`DenyList`] types: +Allow and deny lists are configured per-scan via [`ScanContext`], not on the +engine itself: ```rust,ignore -let allow = AllowList::new() - .with("123-45-6789") // suppress known test SSN - .with("000-00-0000"); - -let deny = DenyList::new() - .with("John Doe", EntityCategory::PersonalIdentity, EntityKind::PersonName); - -let engine = PatternEngine::builder() - .with_allow(allow) - .with_deny(deny) - .build()?; +use nvisy_pattern::prelude::*; +use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod}; + +let ctx = ScanContext::new() + .with_allow(AllowList::new() + .with("123-45-6789") // suppress known test SSN + .with("000-00-0000")) + .with_deny(DenyList::new() + .with("John Doe", DenyRule { + category: EntityCategory::PersonalIdentity, + entity_kind: EntityKind::PersonName, + method: RecognitionMethod::Ner, + })); + +let matches = PatternEngine::instance().scan_text("...", &ctx); ``` - **Allow list** (`AllowList`): matched values that appear in the allow list @@ -111,12 +115,10 @@ let engine = PatternEngine::builder() was not matched by any regex or dictionary pattern, it is injected as a synthetic `RawMatch` with confidence `1.0` and `pattern_name: None`. -Both types implement `FromIterator` for easy construction from iterators. - ## Validators Validators are post-match checks resolved by name through `ValidatorResolver`. -Regex patterns reference a validator by name in their `pattern.validator` field; +Regex patterns reference a validator by name in their `pattern.validator` field: the engine runs the validator on each raw match and drops values that fail. 
## Documentation diff --git a/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs b/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs index 1958ad8..e2baa97 100644 --- a/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs +++ b/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs @@ -1,27 +1,20 @@ -//! CSV dictionary: one row per entity, each cell is a matchable variant. +//! CSV dictionary: one row per entity, each cell becomes a matchable variant. use std::path::Path; -use super::Dictionary; - -/// Error returned when a CSV dictionary cannot be parsed. -#[derive(Debug, thiserror::Error)] -#[error("failed to parse CSV record in dictionary '{name}': {source}")] -pub struct CsvDictionaryError { - name: String, - source: csv::Error, -} +use super::{CsvDictionaryError, Dictionary, DictionaryLoadError, DictionaryTerm}; /// A dictionary parsed from a CSV file. /// /// Each row may contain multiple columns (e.g. name, symbol, code). -/// Every non-empty cell becomes a matchable term. -#[derive(Debug, Clone)] +/// Every non-empty cell becomes a matchable term whose [`column`] +/// records which CSV column it came from. +/// +/// [`column`]: DictionaryTerm::column +#[derive(Debug)] pub struct CsvDictionary { name: String, - entries: Vec, - /// Source column index for each entry (parallel to `entries`). 
- columns: Vec, + terms: Vec, } impl CsvDictionary { @@ -38,8 +31,7 @@ impl CsvDictionary { pub fn new(name: impl Into, text: &str) -> Result { let name = name.into(); - let mut entries = Vec::new(); - let mut columns = Vec::new(); + let mut terms = Vec::new(); let mut reader = csv::ReaderBuilder::new() .has_headers(false) .flexible(true) @@ -53,17 +45,15 @@ impl CsvDictionary { })?; for (col, field) in record.iter().enumerate() { if !field.is_empty() { - entries.push(field.to_owned()); - columns.push(col); + terms.push(DictionaryTerm { + value: field.to_owned(), + column: Some(col as u32), + }); } } } - Ok(Self { - name, - entries, - columns, - }) + Ok(Self { name, terms }) } /// Load a CSV dictionary from a file path. @@ -72,21 +62,20 @@ impl CsvDictionary { /// /// # Errors /// - /// Returns [`DictionaryLoadError`](super::DictionaryLoadError) if the - /// file cannot be read or the CSV content cannot be parsed. - pub fn from_path(path: impl AsRef) -> Result { + /// Returns [`DictionaryLoadError`] if the file cannot be read or + /// the CSV content cannot be parsed. 
+ pub fn from_path(path: impl AsRef) -> Result { let path = path.as_ref(); let name = path .file_stem() .and_then(|s| s.to_str()) .unwrap_or_default(); - let text = std::fs::read_to_string(path).map_err(|source| { - super::DictionaryLoadError::ReadFile { + let text = + std::fs::read_to_string(path).map_err(|source| DictionaryLoadError::ReadFile { path: path.to_owned(), source, - } - })?; - Self::new(name, &text).map_err(|source| super::DictionaryLoadError::CsvParse { + })?; + Self::new(name, &text).map_err(|source| DictionaryLoadError::CsvParse { path: path.to_owned(), source, }) @@ -98,12 +87,8 @@ impl Dictionary for CsvDictionary { &self.name } - fn entries(&self) -> &[String] { - &self.entries - } - - fn columns(&self) -> Option<&[usize]> { - Some(&self.columns) + fn terms(&self) -> &[DictionaryTerm] { + &self.terms } } @@ -115,24 +100,29 @@ mod tests { fn parses_rows_with_variants() { let dict = CsvDictionary::new("test", "US Dollar,USD\nEuro,EUR\n").unwrap(); assert_eq!(dict.name(), "test"); - assert_eq!(dict.entries(), &["US Dollar", "USD", "Euro", "EUR"]); + + let values: Vec<&str> = dict.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(values, &["US Dollar", "USD", "Euro", "EUR"]); } #[test] fn handles_variable_columns() { let dict = CsvDictionary::new("test", "a,b,c\nd,e\n").unwrap(); - assert_eq!(dict.entries(), &["a", "b", "c", "d", "e"]); + let values: Vec<&str> = dict.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(values, &["a", "b", "c", "d", "e"]); } #[test] fn skips_empty_fields() { let dict = CsvDictionary::new("test", "a,,b\n").unwrap(); - assert_eq!(dict.entries(), &["a", "b"]); + let values: Vec<&str> = dict.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(values, &["a", "b"]); } #[test] fn column_indices_are_tracked() { let dict = CsvDictionary::new("test", "a,b,c\nd,e\n").unwrap(); - assert_eq!(dict.columns(), Some([0, 1, 2, 0, 1].as_slice())); + let columns: Vec> = dict.terms().iter().map(|t| 
t.column).collect(); + assert_eq!(columns, &[Some(0), Some(1), Some(2), Some(0), Some(1)]); } } diff --git a/crates/nvisy-pattern/src/dictionaries/csv_error.rs b/crates/nvisy-pattern/src/dictionaries/csv_error.rs new file mode 100644 index 0000000..9db84f9 --- /dev/null +++ b/crates/nvisy-pattern/src/dictionaries/csv_error.rs @@ -0,0 +1,19 @@ +//! Error type for CSV dictionary parsing. + +use nvisy_core::{Error, ErrorKind}; + +/// Error returned when a CSV dictionary cannot be parsed. +#[derive(Debug, thiserror::Error)] +#[error("failed to parse CSV record in dictionary '{name}': {source}")] +pub struct CsvDictionaryError { + pub(crate) name: String, + pub(crate) source: csv::Error, +} + +impl From for Error { + fn from(err: CsvDictionaryError) -> Self { + Error::new(ErrorKind::Validation, err.to_string()) + .with_component("nvisy-pattern::dictionaries") + .with_source(err) + } +} diff --git a/crates/nvisy-pattern/src/dictionaries/dictionary.rs b/crates/nvisy-pattern/src/dictionaries/dictionary.rs index 8edc63f..e32f882 100644 --- a/crates/nvisy-pattern/src/dictionaries/dictionary.rs +++ b/crates/nvisy-pattern/src/dictionaries/dictionary.rs @@ -1,12 +1,27 @@ -//! Core [`Dictionary`] trait and [`BoxDictionary`] alias. +//! Core [`Dictionary`] trait, [`DictionaryTerm`], and [`BoxDictionary`] type alias. + +/// A single matchable term within a [`Dictionary`]. +/// +/// Each term carries its matched value and, for multi-column sources like +/// CSV files, the column index it originated from. Plain-text dictionaries +/// leave `column` as `None` (logically equivalent to column 0). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DictionaryTerm { + /// The matchable text value. + pub value: String, + /// Source column index for CSV dictionaries. + /// + /// `None` for plain-text dictionaries where column position is + /// not meaningful. + pub column: Option, +} /// A named collection of matchable terms (e.g. nationalities, currencies). 
/// -/// Implementors load their entries from an asset file at compile time. /// Two built-in implementations are provided: /// /// - [`TxtDictionary`]: plain text, one entry per line. -/// - [`CsvDictionary`]: CSV, each cell is a term. +/// - [`CsvDictionary`]: CSV, each cell is a term with its column index. /// /// [`TxtDictionary`]: super::TxtDictionary /// [`CsvDictionary`]: super::CsvDictionary @@ -15,16 +30,7 @@ pub trait Dictionary: Send + Sync { fn name(&self) -> &str; /// All matchable terms produced by this dictionary. - fn entries(&self) -> &[String]; - - /// Column index for each entry, parallel to [`entries`](Self::entries). - /// - /// Returns `Some` for CSV dictionaries where each cell tracks its - /// source column. Returns `None` for plain-text dictionaries (all - /// entries are logically in column 0). - fn columns(&self) -> Option<&[usize]> { - None - } + fn terms(&self) -> &[DictionaryTerm]; } /// Type-erased boxed [`Dictionary`]. diff --git a/crates/nvisy-pattern/src/dictionaries/dictionary_error.rs b/crates/nvisy-pattern/src/dictionaries/dictionary_error.rs new file mode 100644 index 0000000..dc34f40 --- /dev/null +++ b/crates/nvisy-pattern/src/dictionaries/dictionary_error.rs @@ -0,0 +1,42 @@ +//! Error type for dictionary filesystem loading. + +use nvisy_core::{Error, ErrorKind}; + +use super::CsvDictionaryError; + +/// Error returned when loading dictionaries from the filesystem. +#[derive(Debug, thiserror::Error)] +pub enum DictionaryLoadError { + /// The directory could not be read. + #[error("failed to read dictionary directory '{}': {source}", path.display())] + ReadDir { + path: std::path::PathBuf, + source: std::io::Error, + }, + /// A dictionary file could not be read. + #[error("failed to read dictionary file '{}': {source}", path.display())] + ReadFile { + path: std::path::PathBuf, + source: std::io::Error, + }, + /// A CSV dictionary file failed to parse. 
+ #[error("failed to parse CSV dictionary '{}': {source}", path.display())] + CsvParse { + path: std::path::PathBuf, + source: CsvDictionaryError, + }, +} + +impl From for Error { + fn from(err: DictionaryLoadError) -> Self { + let kind = match &err { + DictionaryLoadError::ReadDir { .. } | DictionaryLoadError::ReadFile { .. } => { + ErrorKind::Internal + } + DictionaryLoadError::CsvParse { .. } => ErrorKind::Validation, + }; + Error::new(kind, err.to_string()) + .with_component("nvisy-pattern::dictionaries") + .with_source(err) + } +} diff --git a/crates/nvisy-pattern/src/dictionaries/dictionary_registry.rs b/crates/nvisy-pattern/src/dictionaries/dictionary_registry.rs new file mode 100644 index 0000000..27c5318 --- /dev/null +++ b/crates/nvisy-pattern/src/dictionaries/dictionary_registry.rs @@ -0,0 +1,326 @@ +//! [`DictionaryRegistry`]: named dictionary collection with O(log n) lookup. + +use std::collections::BTreeMap; +use std::path::Path; +use std::sync::LazyLock; + +use include_dir::{Dir, include_dir}; + +use super::{BoxDictionary, CsvDictionary, Dictionary, DictionaryLoadError, TxtDictionary}; + +const TARGET: &str = "nvisy_pattern::dictionaries"; + +/// A registry of named [`Dictionary`] instances with O(log n) lookup. +/// +/// Use [`load_builtins`] to create a registry pre-populated with +/// the compile-time-embedded dictionary files, or [`load_dir`] to +/// load from a filesystem directory at runtime. +/// +/// [`load_builtins`]: Self::load_builtins +/// [`load_dir`]: Self::load_dir +#[derive(Default)] +pub struct DictionaryRegistry { + inner: BTreeMap, +} + +impl std::fmt::Debug for DictionaryRegistry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let names: Vec<&str> = self.inner.keys().map(|s| s.as_str()).collect(); + f.debug_struct("DictionaryRegistry") + .field("len", &self.inner.len()) + .field("names", &names) + .finish() + } +} + +impl DictionaryRegistry { + /// Create an empty registry. 
+ pub fn new() -> Self { + Self::default() + } + + /// Insert a dictionary, keyed by its [`Dictionary::name`]. + pub fn insert(&mut self, dict: BoxDictionary) { + let name = dict.name().to_owned(); + self.inner.insert(name, dict); + } + + /// Look up a dictionary by name. + #[must_use] + pub fn get(&self, name: &str) -> Option<&dyn Dictionary> { + self.inner.get(name).map(|b| b.as_ref()) + } + + /// Iterate over all registered dictionaries as `(name, &dyn Dictionary)` pairs. + pub fn iter(&self) -> impl Iterator { + self.inner.iter().map(|(k, v)| (k.as_str(), v.as_ref())) + } + + /// Iterate over all registered dictionary names. + pub fn names(&self) -> impl Iterator { + self.inner.keys().map(|s| s.as_str()) + } + + /// Total number of registered dictionaries. + #[must_use] + pub fn len(&self) -> usize { + self.inner.len() + } + + /// Whether the registry contains no dictionaries. + #[must_use] + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Load all `.txt` and `.csv` files from the embedded + /// `assets/dictionaries/` directory into this registry. + /// + /// Unrecognised file extensions are logged as warnings and skipped. 
+ #[tracing::instrument(target = TARGET, name = "dictionaries.load_builtins", skip(self), fields(count))] + pub fn load_builtins(&mut self) { + static DICT_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/assets/dictionaries"); + + for file in DICT_DIR.files() { + let path = file.path(); + let text = file + .contents_utf8() + .expect("dictionary file is not valid UTF-8"); + + let name = path + .file_stem() + .expect("dictionary path has no file stem") + .to_string_lossy(); + + let dict: BoxDictionary = match path.extension().and_then(|e| e.to_str()) { + Some("txt") => Box::new(TxtDictionary::new(name.as_ref(), text)), + Some("csv") => Box::new( + CsvDictionary::new(name.as_ref(), text) + .expect("built-in CSV dictionary must parse"), + ), + other => { + tracing::warn!( + target: TARGET, + path = %path.display(), + extension = ?other, + "skipping unrecognised dictionary file", + ); + continue; + } + }; + + tracing::trace!( + target: TARGET, + name = dict.name(), + terms = dict.terms().len(), + "dictionary loaded", + ); + self.insert(dict); + } + + tracing::Span::current().record("count", self.len()); + tracing::debug!(target: TARGET, "built-in dictionaries loaded"); + } + + /// Load a single `.txt` or `.csv` dictionary file and insert it. + /// + /// The dictionary name is derived from the file stem. + /// Files with unrecognised extensions are logged as warnings and + /// ignored (no error is returned). + /// + /// # Errors + /// + /// Returns [`nvisy_core::Error`] if the file cannot be read or + /// a CSV file fails to parse. 
+ #[tracing::instrument(target = TARGET, name = "dictionaries.load_file", skip_all, fields(path = %path.as_ref().display()))] + pub fn load_file(&mut self, path: impl AsRef) -> nvisy_core::Result<()> { + let path = path.as_ref(); + + let dict: BoxDictionary = match path.extension().and_then(|e| e.to_str()) { + Some("txt") => { + let d = TxtDictionary::from_path(path).map_err(|source| { + DictionaryLoadError::ReadFile { + path: path.to_owned(), + source, + } + })?; + Box::new(d) + } + Some("csv") => Box::new(CsvDictionary::from_path(path)?), + other => { + tracing::warn!( + target: TARGET, + path = %path.display(), + extension = ?other, + "skipping unrecognised dictionary file", + ); + return Ok(()); + } + }; + + tracing::trace!( + target: TARGET, + name = dict.name(), + terms = dict.terms().len(), + "dictionary loaded from filesystem", + ); + self.insert(dict); + Ok(()) + } + + /// Load all `.txt` and `.csv` files from a filesystem directory. + /// + /// Files with unrecognised extensions are logged as warnings and + /// skipped. Loaded dictionaries are inserted into `self`, so this + /// can be called after [`load_builtins`](Self::load_builtins) to + /// layer user-provided dictionaries on top of the built-ins. + /// + /// # Errors + /// + /// Returns [`nvisy_core::Error`] if the directory cannot be read, + /// a file cannot be read, or a CSV file fails to parse. 
+ #[tracing::instrument(target = TARGET, name = "dictionaries.load_dir", skip_all, fields(path = %dir.as_ref().display(), count))] + pub fn load_dir(&mut self, dir: impl AsRef) -> nvisy_core::Result<()> { + let dir = dir.as_ref(); + + let entries = std::fs::read_dir(dir).map_err(|source| DictionaryLoadError::ReadDir { + path: dir.to_owned(), + source, + })?; + + let mut count = 0usize; + for entry in entries { + let entry = entry.map_err(|source| DictionaryLoadError::ReadDir { + path: dir.to_owned(), + source, + })?; + let path = entry.path(); + + if !path.is_file() { + continue; + } + + self.load_file(&path)?; + count += 1; + } + + tracing::Span::current().record("count", count); + tracing::debug!(target: TARGET, "filesystem dictionaries loaded"); + Ok(()) + } +} + +static BUILTIN_REGISTRY: LazyLock = LazyLock::new(|| { + let mut reg = DictionaryRegistry::new(); + reg.load_builtins(); + reg +}); + +/// Return a reference to the lazily-initialised built-in [`DictionaryRegistry`]. +pub fn builtin_registry() -> &'static DictionaryRegistry { + &BUILTIN_REGISTRY +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use super::*; + + fn registry() -> &'static DictionaryRegistry { + builtin_registry() + } + + #[test] + fn builtins_load_and_are_nonempty() { + let reg = registry(); + assert!(!reg.is_empty()); + for (_, dict) in reg.iter() { + assert!(!dict.terms().is_empty(), "{} is empty", dict.name()); + } + } + + #[test] + fn terms_are_trimmed_and_nonempty() { + for (_, dict) in registry().iter() { + let name = dict.name(); + for term in dict.terms() { + assert!(!term.value.is_empty(), "empty term in {name}"); + assert_eq!( + term.value, + term.value.trim(), + "untrimmed term in {name}: {:?}", + term.value, + ); + } + } + } + + #[test] + fn no_duplicate_terms_per_dictionary() { + for (_, dict) in registry().iter() { + let mut seen = HashSet::new(); + for term in dict.terms() { + assert!( + seen.insert(term.value.as_str()), + "duplicate term {:?} in 
dictionary {}", + term.value, + dict.name(), + ); + } + } + } + + #[test] + fn registry_names_are_sorted() { + let keys: Vec<&str> = registry().names().collect(); + let mut sorted = keys.clone(); + sorted.sort(); + assert_eq!(keys, sorted); + } + + #[test] + fn registry_insert_and_get() { + let mut reg = DictionaryRegistry::new(); + let dict: BoxDictionary = Box::new(TxtDictionary::new("test", "foo\nbar\n")); + reg.insert(dict); + + assert_eq!(reg.len(), 1); + + let dict = reg.get("test").unwrap(); + assert_eq!(dict.name(), "test"); + + let values: Vec<&str> = dict.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(values, &["foo", "bar"]); + } + + #[test] + fn load_dir_reads_filesystem() { + let dir = tempfile::tempdir().unwrap(); + + std::fs::write(dir.path().join("colors.txt"), "red\nblue\ngreen\n").unwrap(); + std::fs::write(dir.path().join("sizes.csv"), "small,S\nmedium,M\nlarge,L\n").unwrap(); + // Should be skipped. + std::fs::write(dir.path().join("readme.md"), "ignore me").unwrap(); + + let mut reg = DictionaryRegistry::new(); + reg.load_dir(dir.path()).unwrap(); + + assert_eq!(reg.len(), 2); + + let colors = reg.get("colors").unwrap(); + let color_values: Vec<&str> = colors.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(color_values, &["red", "blue", "green"]); + + let sizes = reg.get("sizes").unwrap(); + let size_values: Vec<&str> = sizes.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(size_values, &["small", "S", "medium", "M", "large", "L"]); + } + + #[test] + fn load_dir_missing_directory() { + let mut reg = DictionaryRegistry::new(); + let result = reg.load_dir("/nonexistent/path"); + assert!(result.is_err()); + } +} diff --git a/crates/nvisy-pattern/src/dictionaries/mod.rs b/crates/nvisy-pattern/src/dictionaries/mod.rs index 2d94883..176843c 100644 --- a/crates/nvisy-pattern/src/dictionaries/mod.rs +++ b/crates/nvisy-pattern/src/dictionaries/mod.rs @@ -1,19 +1,19 @@ //! 
Built-in dictionaries for entity matching. //! //! Dictionaries are asset files under `assets/dictionaries/` containing -//! matchable terms (nationalities, religions, currencies, etc.). They are +//! matchable terms (nationalities, religions, currencies, etc.). They are //! embedded at compile time and loaded lazily on first access. //! //! Two file formats are supported: //! //! - **Plain text** (`.txt`): one entry per line, see [`TxtDictionary`]. -//! - **CSV** (`.csv`): each row holds variants of a single entity -//! (e.g. `US Dollar,USD`), see [`CsvDictionary`]. +//! - **CSV** (`.csv`): each row holds variants of a single entity (e.g. +//! `US Dollar,USD`), see [`CsvDictionary`]. //! //! # Key types //! //! - [`Dictionary`]: trait implemented by every dictionary. -//! - [`DictionaryRegistry`]: sorted collection with O(log n) lookup by name. +//! - [`DictionaryRegistry`]: sorted collection with O(log n) lookup. //! //! [`TxtDictionary`]: crate::dictionaries::TxtDictionary //! [`CsvDictionary`]: crate::dictionaries::CsvDictionary @@ -21,355 +21,15 @@ //! [`DictionaryRegistry`]: crate::dictionaries::DictionaryRegistry mod csv_dictionary; +mod csv_error; mod dictionary; +mod dictionary_error; +mod dictionary_registry; mod text_dictionary; -use std::collections::BTreeMap; -use std::path::Path; -use std::sync::LazyLock; - -use include_dir::{Dir, include_dir}; - -pub use self::csv_dictionary::{CsvDictionary, CsvDictionaryError}; -pub use self::dictionary::{BoxDictionary, Dictionary}; +pub use self::csv_dictionary::CsvDictionary; +pub(crate) use self::csv_error::CsvDictionaryError; +pub use self::dictionary::{BoxDictionary, Dictionary, DictionaryTerm}; +pub(crate) use self::dictionary_error::DictionaryLoadError; +pub use self::dictionary_registry::{DictionaryRegistry, builtin_registry}; pub use self::text_dictionary::TxtDictionary; - -const TARGET: &str = "nvisy_pattern::dictionaries"; - -/// Error returned when loading dictionaries from a filesystem directory. 
-#[derive(Debug, thiserror::Error)] -pub enum DictionaryLoadError { - /// The directory could not be read. - #[error("failed to read dictionary directory '{}': {source}", path.display())] - ReadDir { - path: std::path::PathBuf, - source: std::io::Error, - }, - /// A dictionary file could not be read. - #[error("failed to read dictionary file '{}': {source}", path.display())] - ReadFile { - path: std::path::PathBuf, - source: std::io::Error, - }, - /// A CSV dictionary file failed to parse. - #[error("failed to parse CSV dictionary '{}': {source}", path.display())] - CsvParse { - path: std::path::PathBuf, - source: CsvDictionaryError, - }, -} - -/// A registry of named [`Dictionary`] instances with O(log n) lookup. -/// -/// Use [`load_builtins`] to create a registry pre-populated with -/// the compile-time-embedded dictionary files, or [`load_dir`] to -/// load from a filesystem directory at runtime. -/// -/// [`load_builtins`]: Self::load_builtins -/// [`load_dir`]: Self::load_dir -pub struct DictionaryRegistry { - inner: BTreeMap, -} - -impl std::fmt::Debug for DictionaryRegistry { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let names: Vec<&str> = self.inner.keys().map(|s| s.as_str()).collect(); - f.debug_struct("DictionaryRegistry") - .field("len", &self.inner.len()) - .field("names", &names) - .finish() - } -} - -impl DictionaryRegistry { - /// Create an empty registry. - pub fn new() -> Self { - Self::default() - } - - /// Insert a dictionary, keyed by its [`Dictionary::name`]. - pub fn insert(&mut self, dict: BoxDictionary) { - let name = dict.name().to_owned(); - self.inner.insert(name, dict); - } - - /// Look up a dictionary by name. - #[must_use] - pub fn get(&self, name: &str) -> Option<&dyn Dictionary> { - self.inner.get(name).map(|b| b.as_ref()) - } - - /// Iterate over all registered dictionaries as `(name, &dyn Dictionary)` pairs. 
- pub fn iter(&self) -> impl Iterator { - self.inner.iter().map(|(k, v)| (k.as_str(), v.as_ref())) - } - - /// Iterate over all registered dictionary names. - pub fn names(&self) -> impl Iterator { - self.inner.keys().map(|s| s.as_str()) - } - - /// Total number of registered dictionaries. - #[must_use] - pub fn len(&self) -> usize { - self.inner.len() - } - - /// Whether the registry contains no dictionaries. - #[must_use] - pub fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - /// Load all `.txt` and `.csv` files from the embedded - /// `assets/dictionaries/` directory into this registry. - /// - /// Unrecognised file extensions are logged as warnings and skipped. - #[tracing::instrument(target = TARGET, name = "dictionaries.load_builtins", skip(self), fields(count))] - pub fn load_builtins(&mut self) { - static DICT_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/assets/dictionaries"); - - for file in DICT_DIR.files() { - let path = file.path(); - let text = file - .contents_utf8() - .expect("dictionary file is not valid UTF-8"); - - let name = path - .file_stem() - .expect("dictionary path has no file stem") - .to_string_lossy(); - - let dict: BoxDictionary = match path.extension().and_then(|e| e.to_str()) { - Some("txt") => Box::new(TxtDictionary::new(name.as_ref(), text)), - Some("csv") => Box::new( - CsvDictionary::new(name.as_ref(), text) - .expect("built-in CSV dictionary must parse"), - ), - other => { - tracing::warn!( - target: TARGET, - path = %path.display(), - extension = ?other, - "skipping unrecognised dictionary file", - ); - continue; - } - }; - - tracing::trace!( - target: TARGET, - name = dict.name(), - entries = dict.entries().len(), - "dictionary loaded", - ); - self.insert(dict); - } - - tracing::Span::current().record("count", self.len()); - tracing::debug!(target: TARGET, "built-in dictionaries loaded"); - } - - /// Load a single `.txt` or `.csv` dictionary file and insert it. 
- /// - /// The dictionary name is derived from the file stem. - /// Files with unrecognised extensions are logged as warnings and - /// ignored (no error is returned). - /// - /// # Errors - /// - /// Returns [`DictionaryLoadError`] if the file cannot be read or - /// a CSV file fails to parse. - #[tracing::instrument(target = TARGET, name = "dictionaries.load_file", skip_all, fields(path = %path.as_ref().display()))] - pub fn load_file(&mut self, path: impl AsRef) -> Result<(), DictionaryLoadError> { - let path = path.as_ref(); - - let dict: BoxDictionary = match path.extension().and_then(|e| e.to_str()) { - Some("txt") => { - let d = TxtDictionary::from_path(path).map_err(|source| { - DictionaryLoadError::ReadFile { - path: path.to_owned(), - source, - } - })?; - Box::new(d) - } - Some("csv") => Box::new(CsvDictionary::from_path(path)?), - other => { - tracing::warn!( - target: TARGET, - path = %path.display(), - extension = ?other, - "skipping unrecognised dictionary file", - ); - return Ok(()); - } - }; - - tracing::trace!( - target: TARGET, - name = dict.name(), - entries = dict.entries().len(), - "dictionary loaded from filesystem", - ); - self.insert(dict); - Ok(()) - } - - /// Load all `.txt` and `.csv` files from a filesystem directory. - /// - /// Files with unrecognised extensions are logged as warnings and - /// skipped. Loaded dictionaries are inserted into `self`, so this - /// can be called after [`load_builtins`](Self::load_builtins) to - /// layer user-provided dictionaries on top of the built-ins. - /// - /// # Errors - /// - /// Returns [`DictionaryLoadError`] if the directory cannot be read, - /// a file cannot be read, or a CSV file fails to parse. 
- #[tracing::instrument(target = TARGET, name = "dictionaries.load_dir", skip_all, fields(path = %dir.as_ref().display(), count))] - pub fn load_dir(&mut self, dir: impl AsRef) -> Result<(), DictionaryLoadError> { - let dir = dir.as_ref(); - - let entries = std::fs::read_dir(dir).map_err(|source| DictionaryLoadError::ReadDir { - path: dir.to_owned(), - source, - })?; - - let mut count = 0usize; - for entry in entries { - let entry = entry.map_err(|source| DictionaryLoadError::ReadDir { - path: dir.to_owned(), - source, - })?; - let path = entry.path(); - - if !path.is_file() { - continue; - } - - self.load_file(&path)?; - count += 1; - } - - tracing::Span::current().record("count", count); - tracing::debug!(target: TARGET, "filesystem dictionaries loaded"); - Ok(()) - } -} - -impl Default for DictionaryRegistry { - fn default() -> Self { - Self { - inner: BTreeMap::new(), - } - } -} - -static BUILTIN_REGISTRY: LazyLock = LazyLock::new(|| { - let mut reg = DictionaryRegistry::new(); - reg.load_builtins(); - reg -}); - -/// Return a reference to the lazily-initialised built-in [`DictionaryRegistry`]. 
-pub fn builtin_registry() -> &'static DictionaryRegistry { - &BUILTIN_REGISTRY -} - -#[cfg(test)] -mod tests { - use std::collections::HashSet; - - use super::*; - - fn registry() -> &'static DictionaryRegistry { - builtin_registry() - } - - #[test] - fn builtins_load_and_are_nonempty() { - let reg = registry(); - assert!(!reg.is_empty()); - for (_, dict) in reg.iter() { - assert!(!dict.entries().is_empty(), "{} is empty", dict.name()); - } - } - - #[test] - fn entries_are_trimmed_and_nonempty() { - for (_, dict) in registry().iter() { - let name = dict.name(); - for entry in dict.entries() { - assert!(!entry.is_empty(), "empty entry in {name}"); - assert_eq!(*entry, entry.trim(), "untrimmed entry in {name}: {entry:?}"); - } - } - } - - #[test] - fn no_duplicate_entries_per_dictionary() { - for (_, dict) in registry().iter() { - let mut seen = HashSet::new(); - for entry in dict.entries() { - assert!( - seen.insert(entry.as_str()), - "duplicate entry {entry:?} in dictionary {}", - dict.name(), - ); - } - } - } - - #[test] - fn registry_names_are_sorted() { - let keys: Vec<&str> = registry().names().collect(); - let mut sorted = keys.clone(); - sorted.sort(); - assert_eq!(keys, sorted); - } - - #[test] - fn registry_insert_and_get() { - let mut reg = DictionaryRegistry::new(); - let dict: BoxDictionary = Box::new(TxtDictionary::new("test", "foo\nbar\n")); - reg.insert(dict); - - assert_eq!(reg.len(), 1); - - let dict = reg.get("test").unwrap(); - assert_eq!(dict.name(), "test"); - assert_eq!(dict.entries(), &["foo", "bar"]); - } - - #[test] - fn load_dir_reads_filesystem() { - let dir = tempfile::tempdir().unwrap(); - - std::fs::write(dir.path().join("colors.txt"), "red\nblue\ngreen\n").unwrap(); - std::fs::write(dir.path().join("sizes.csv"), "small,S\nmedium,M\nlarge,L\n").unwrap(); - // Should be skipped. 
- std::fs::write(dir.path().join("readme.md"), "ignore me").unwrap(); - - let mut reg = DictionaryRegistry::new(); - reg.load_dir(dir.path()).unwrap(); - - assert_eq!(reg.len(), 2); - - let colors = reg.get("colors").unwrap(); - assert_eq!(colors.entries(), &["red", "blue", "green"]); - - let sizes = reg.get("sizes").unwrap(); - assert_eq!( - sizes.entries(), - &["small", "S", "medium", "M", "large", "L"] - ); - } - - #[test] - fn load_dir_missing_directory() { - let mut reg = DictionaryRegistry::new(); - let result = reg.load_dir("/nonexistent/path"); - assert!(result.is_err()); - } -} diff --git a/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs b/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs index d4519cb..4916aec 100644 --- a/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs +++ b/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs @@ -1,14 +1,14 @@ -//! Plain-text dictionary: one matchable entry per line. +//! Plain-text dictionary: one entry per line. use std::path::Path; -use super::Dictionary; +use super::{Dictionary, DictionaryTerm}; /// A dictionary parsed from a plain-text file (one entry per line). -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct TxtDictionary { name: String, - entries: Vec, + terms: Vec, } impl TxtDictionary { @@ -19,14 +19,17 @@ impl TxtDictionary { pub fn new(name: impl Into, text: &str) -> Self { let name = name.into(); - let entries = text + let terms = text .lines() .map(|l| l.trim()) .filter(|l| !l.is_empty()) - .map(String::from) + .map(|l| DictionaryTerm { + value: l.to_owned(), + column: None, + }) .collect(); - Self { name, entries } + Self { name, terms } } /// Load a plain-text dictionary from a file path. 
@@ -52,8 +55,8 @@ impl Dictionary for TxtDictionary { &self.name } - fn entries(&self) -> &[String] { - &self.entries + fn terms(&self) -> &[DictionaryTerm] { + &self.terms } } @@ -65,6 +68,10 @@ mod tests { fn parses_lines() { let dict = TxtDictionary::new("test", "alpha\n beta \n\ngamma\n"); assert_eq!(dict.name(), "test"); - assert_eq!(dict.entries(), &["alpha", "beta", "gamma"]); + + let values: Vec<&str> = dict.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(values, &["alpha", "beta", "gamma"]); + + assert!(dict.terms().iter().all(|t| t.column.is_none())); } } diff --git a/crates/nvisy-pattern/src/engine/allow_list.rs b/crates/nvisy-pattern/src/engine/allow_list.rs index 24ee8b9..f08b6cc 100644 --- a/crates/nvisy-pattern/src/engine/allow_list.rs +++ b/crates/nvisy-pattern/src/engine/allow_list.rs @@ -1,4 +1,4 @@ -//! [`AllowList`] — exact-match suppression of known false positives. +//! [`AllowList`]: exact-match suppression of known false positives. use std::collections::HashSet; diff --git a/crates/nvisy-pattern/src/engine/builder.rs b/crates/nvisy-pattern/src/engine/builder.rs index 581adb7..acb56a3 100644 --- a/crates/nvisy-pattern/src/engine/builder.rs +++ b/crates/nvisy-pattern/src/engine/builder.rs @@ -1,17 +1,13 @@ -//! [`PatternEngineBuilder`] — configures and compiles a [`PatternEngine`]. +//! [`PatternEngineBuilder`]: configures and compiles a [`PatternEngine`]. use regex::{Regex, RegexSet}; -use super::allow_list::AllowList; -use super::deny_list::DenyList; use super::error::PatternEngineError; -use super::{DictEntry, PatternEngine, RegexEntry}; +use super::{DictEntry, PatternEngine, RegexEntry, TARGET}; use crate::dictionaries; -use crate::patterns::{self, MatchSource, Pattern}; +use crate::patterns::{MatchSource, Pattern}; use crate::validators::ValidatorResolver; -const TARGET: &str = "nvisy_pattern::engine"; - /// Builder for [`PatternEngine`]. /// /// By default all built-in patterns are included. 
Use @@ -20,8 +16,6 @@ const TARGET: &str = "nvisy_pattern::engine"; pub struct PatternEngineBuilder { pattern_names: Option>, confidence_threshold: f64, - allow_list: AllowList, - deny_list: DenyList, } impl PatternEngineBuilder { @@ -39,41 +33,22 @@ impl PatternEngineBuilder { /// Set the minimum confidence score for matches. /// /// Matches with confidence below this value are discarded during - /// [`scan_text`](PatternEngine::scan_text). Defaults to `0.0`. + /// [`scan_text`](PatternEngine::scan_text). Defaults to `0.0`. pub fn with_confidence_threshold(mut self, threshold: f64) -> Self { self.confidence_threshold = threshold; self } - /// Set the allow list. - /// - /// Matches whose exact value appears in the allow list are suppressed - /// (dropped) during [`scan_text`](PatternEngine::scan_text). - pub fn with_allow(mut self, list: AllowList) -> Self { - self.allow_list = list; - self - } - - /// Set the deny list. - /// - /// If a deny-list value is found in the scanned text but was not matched - /// by any regex or dictionary pattern, it is injected as a synthetic match - /// with confidence `1.0`. - pub fn with_deny(mut self, list: DenyList) -> Self { - self.deny_list = list; - self - } - /// Compile all selected patterns and build the engine. /// /// # Errors /// - /// Returns [`PatternEngineError`] if a regex fails to compile, a + /// Returns [`nvisy_core::Error`] if a regex fails to compile, a /// referenced dictionary is missing, or the Aho-Corasick automaton /// cannot be built. 
#[tracing::instrument(target = TARGET, name = "PatternEngine::build", skip(self))] - pub fn build(self) -> Result { - let pat_reg = patterns::builtin_registry(); + pub fn build(self) -> nvisy_core::Result { + let pat_reg = crate::patterns::builtin_registry(); let dict_reg = dictionaries::builtin_registry(); let active: Vec<&dyn Pattern> = match &self.pattern_names { @@ -112,11 +87,12 @@ impl PatternEngineBuilder { dictionary: dp.name.clone(), } })?; - let values: Vec = dict.entries().to_vec(); - if values.is_empty() { + let terms = dict.terms(); + if terms.is_empty() { continue; } - let columns = dict.columns().map(|c| c.to_vec()); + let values: Vec = terms.iter().map(|t| t.value.clone()).collect(); + let columns: Vec> = terms.iter().map(|t| t.column).collect(); let automaton = aho_corasick::AhoCorasickBuilder::new() .ascii_case_insensitive(!dp.case_sensitive) .build(&values) @@ -155,8 +131,6 @@ impl PatternEngineBuilder { dict_entries, validators, confidence_threshold: self.confidence_threshold, - allow_set: self.allow_list, - deny_set: self.deny_list, }) } } diff --git a/crates/nvisy-pattern/src/engine/deny_list.rs b/crates/nvisy-pattern/src/engine/deny_list.rs index 620c4ca..9fbb2f3 100644 --- a/crates/nvisy-pattern/src/engine/deny_list.rs +++ b/crates/nvisy-pattern/src/engine/deny_list.rs @@ -1,4 +1,4 @@ -//! [`DenyList`] — forced detection of known sensitive values. +//! [`DenyList`]: forced detection of known sensitive values. use std::collections::BTreeMap; @@ -11,7 +11,7 @@ pub struct DenyRule { pub category: EntityCategory, /// Entity kind for the injected match. pub entity_kind: EntityKind, - /// Recognition method to assign to injected matches. + /// Recognition method carried from the original detection source. 
pub method: RecognitionMethod, } @@ -25,11 +25,14 @@ pub struct DenyRule { /// # Examples /// /// ```rust,ignore -/// use nvisy_ontology::entity::{EntityCategory, EntityKind}; +/// use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod}; /// /// let deny = DenyList::new() -/// .with("John Doe", EntityCategory::PersonalIdentity, EntityKind::PersonName) -/// .with("ACME Corp", EntityCategory::PersonalIdentity, EntityKind::OrganizationName); +/// .with("John Doe", DenyRule { +/// category: EntityCategory::PersonalIdentity, +/// entity_kind: EntityKind::PersonName, +/// method: RecognitionMethod::Ner, +/// }); /// ``` #[derive(Debug, Clone, Default)] pub struct DenyList { @@ -42,58 +45,15 @@ impl DenyList { Self::default() } - /// Add a single rule with `RecognitionMethod::Dictionary` as the default method. - pub fn with( - mut self, - value: impl Into, - category: EntityCategory, - entity_kind: EntityKind, - ) -> Self { - self.entries.insert( - value.into(), - DenyRule { - category, - entity_kind, - method: RecognitionMethod::Dictionary, - }, - ); + /// Add a single rule (builder style). + pub fn with(mut self, value: impl Into, rule: DenyRule) -> Self { + self.entries.insert(value.into(), rule); self } - /// Add a single rule with an explicit recognition method. - pub fn with_method( - mut self, - value: impl Into, - category: EntityCategory, - entity_kind: EntityKind, - method: RecognitionMethod, - ) -> Self { - self.entries.insert( - value.into(), - DenyRule { - category, - entity_kind, - method, - }, - ); - self - } - - /// Insert a rule into this list with `RecognitionMethod::Dictionary` as the default method. - pub fn insert( - &mut self, - value: impl Into, - category: EntityCategory, - entity_kind: EntityKind, - ) { - self.entries.insert( - value.into(), - DenyRule { - category, - entity_kind, - method: RecognitionMethod::Dictionary, - }, - ); + /// Insert a rule into this list. 
+ pub fn insert(&mut self, value: impl Into, rule: DenyRule) { + self.entries.insert(value.into(), rule); } /// Whether the list contains the given value. @@ -125,13 +85,3 @@ impl DenyList { self.entries.iter().map(|(k, v)| (k.as_str(), v)) } } - -impl> FromIterator<(S, EntityCategory, EntityKind)> for DenyList { - fn from_iter>(iter: I) -> Self { - let mut list = Self::new(); - for (value, category, entity_kind) in iter { - list.insert(value, category, entity_kind); - } - list - } -} diff --git a/crates/nvisy-pattern/src/engine/error.rs b/crates/nvisy-pattern/src/engine/error.rs index 4de9a38..337a18b 100644 --- a/crates/nvisy-pattern/src/engine/error.rs +++ b/crates/nvisy-pattern/src/engine/error.rs @@ -1,8 +1,12 @@ -//! Errors produced during [`PatternEngine`](super::PatternEngine) construction. +//! Errors produced during [`PatternEngine`] construction. +//! +//! [`PatternEngine`]: super::PatternEngine + +use nvisy_core::{Error, ErrorKind}; /// Errors that can occur while building a [`PatternEngine`](super::PatternEngine). #[derive(Debug, thiserror::Error)] -pub enum PatternEngineError { +pub(crate) enum PatternEngineError { /// A regex pattern string failed to compile. #[error("failed to compile regex for pattern '{name}': {source}")] RegexCompile { name: String, source: regex::Error }, @@ -19,3 +23,11 @@ pub enum PatternEngineError { #[error("failed to build RegexSet pre-filter: {0}")] RegexSetBuild(regex::Error), } + +impl From for Error { + fn from(err: PatternEngineError) -> Self { + Error::new(ErrorKind::Validation, err.to_string()) + .with_component("nvisy-pattern::engine") + .with_source(err) + } +} diff --git a/crates/nvisy-pattern/src/engine/mod.rs b/crates/nvisy-pattern/src/engine/mod.rs index 366b0f9..0883ca0 100644 --- a/crates/nvisy-pattern/src/engine/mod.rs +++ b/crates/nvisy-pattern/src/engine/mod.rs @@ -2,23 +2,23 @@ //! //! [`PatternEngine`] compiles all built-in (and optionally user-selected) //! 
regex patterns and dictionary automata into a single unit that can -//! scan text in one call. Use [`PatternEngineBuilder`] for configuration -//! or [`default_engine`] for an out-of-the-box singleton. +//! scan text in one call. Use [`PatternEngine::builder`] for configuration +//! or [`PatternEngine::instance`] for an out-of-the-box singleton. //! //! # Key types //! -//! - [`PatternEngine`]: the pre-compiled scanning engine. -//! - [`PatternEngineBuilder`]: builder for configuring patterns, thresholds, -//! and allow/deny lists. -//! - [`RawMatch`]: a single match produced by scanning. +//! - [`PatternEngine`]: pre-compiled scanning engine. +//! - [`ScanContext`]: per-scan allow/deny list configuration. +//! - [`RawMatch`]: single match produced by scanning. //! - [`AllowList`] / [`DenyList`]: exact-match suppression and forced detection. -//! - [`PatternEngineError`]: build-time errors. +//! - [`PatternEngineBuilder`]: builder for configuring patterns and thresholds. mod allow_list; mod builder; mod deny_list; mod error; mod pattern_match; +mod scan_context; use std::collections::HashSet; use std::sync::LazyLock; @@ -30,8 +30,8 @@ use regex::{Regex, RegexSet}; pub use self::allow_list::AllowList; pub use self::builder::PatternEngineBuilder; pub use self::deny_list::{DenyList, DenyRule}; -pub use self::error::PatternEngineError; pub use self::pattern_match::RawMatch; +pub use self::scan_context::ScanContext; use crate::patterns::{ContextRule, DictionaryConfidence}; use crate::validators::ValidatorResolver; @@ -58,8 +58,8 @@ struct DictEntry { /// The terms used to build the automaton, indexed by pattern id. values: Vec, /// Per-entry column index from the source dictionary (parallel to `values`). - /// `None` for plain-text dictionaries (all entries are column 0). - columns: Option>, + /// `None` entries indicate plain-text origin (logically column 0). 
+ columns: Vec>, context: Option, } @@ -68,9 +68,10 @@ impl DictEntry { fn resolve_confidence(&self, pattern_index: usize) -> f64 { let col = self .columns - .as_ref() - .and_then(|cols| cols.get(pattern_index).copied()) - .unwrap_or(0); + .get(pattern_index) + .copied() + .flatten() + .unwrap_or(0) as usize; self.confidence.resolve(col) } } @@ -79,25 +80,23 @@ impl DictEntry { /// /// Scanning runs in three phases: /// -/// 1. **Regex** — a [`RegexSet`] pre-filter selects candidate patterns, +/// 1. **Regex**: a [`RegexSet`] pre-filter selects candidate patterns, /// then each matching regex extracts offsets and values. -/// 2. **Dictionary** — Aho-Corasick automata perform literal multi-pattern +/// 2. **Dictionary**: Aho-Corasick automata perform literal multi-pattern /// matching against known-value dictionaries. -/// 3. **Deny list** — known sensitive values not already matched are +/// 3. **Deny list**: known sensitive values not already matched are /// injected as synthetic matches with confidence `1.0`. /// /// Allow-list filtering is applied inline during phases 1 and 2. /// -/// Build via [`PatternEngine::builder`] or use [`default_engine`] for -/// the singleton with all built-in patterns. +/// Build via [`PatternEngine::builder`] or use [`PatternEngine::instance`] +/// for the singleton with all built-in patterns. pub struct PatternEngine { regex_set: RegexSet, regex_entries: Vec, dict_entries: Vec, validators: ValidatorResolver, confidence_threshold: f64, - allow_set: AllowList, - deny_set: DenyList, } impl std::fmt::Debug for PatternEngine { @@ -111,6 +110,12 @@ impl std::fmt::Debug for PatternEngine { } impl PatternEngine { + /// Return a reference to the lazily-initialised default engine + /// containing all built-in patterns. + pub fn instance() -> &'static Self { + &DEFAULT_ENGINE + } + /// Create a new [`PatternEngineBuilder`]. 
pub fn builder() -> PatternEngineBuilder { PatternEngineBuilder::default() @@ -121,21 +126,21 @@ impl PatternEngine { /// Matches whose value appears in the allow list are suppressed. /// Deny-list values found in the text are injected as synthetic matches /// with confidence `1.0` when not already matched. - #[tracing::instrument(target = TARGET, skip(self, text), fields(text_len = text.len(), matches = tracing::field::Empty))] - pub fn scan_text(&self, text: &str) -> Vec { + #[tracing::instrument(target = TARGET, skip(self, text, ctx), fields(text_len = text.len(), matches = tracing::field::Empty))] + pub fn scan_text(&self, text: &str, ctx: &ScanContext) -> Vec { let mut results = Vec::new(); - self.scan_regex(text, &mut results); - self.scan_dict(text, &mut results); - self.scan_deny_list(text, &mut results); + self.scan_regex(text, &ctx.allow, &mut results); + self.scan_dict(text, &ctx.allow, &mut results); + self.scan_deny_list(text, &ctx.deny, &mut results); tracing::Span::current().record("matches", results.len()); results } - /// Phase 1: regex matches — use `RegexSet` as a pre-filter, then run + /// Phase 1: regex matches. Uses `RegexSet` as a pre-filter, then runs /// each matching regex individually to extract offsets and values. - fn scan_regex(&self, text: &str, results: &mut Vec) { + fn scan_regex(&self, text: &str, allow: &AllowList, results: &mut Vec) { let set_matches = self.regex_set.matches(text); for idx in set_matches.iter() { let entry = &self.regex_entries[idx]; @@ -147,7 +152,7 @@ impl PatternEngine { for mat in entry.regex.find_iter(text) { let value = mat.as_str(); - if self.allow_set.contains(value) { + if allow.contains(value) { continue; } @@ -178,21 +183,19 @@ impl PatternEngine { } /// Phase 2: dictionary matches via Aho-Corasick automata. 
- fn scan_dict(&self, text: &str, results: &mut Vec) { + fn scan_dict(&self, text: &str, allow: &AllowList, results: &mut Vec) { for entry in &self.dict_entries { for mat in entry.automaton.find_iter(text) { let pat_idx = mat.pattern().as_usize(); let value = &entry.values[pat_idx]; - // Resolve per-entry confidence: use column override if available, - // otherwise fall back to the pattern's base confidence. let confidence = entry.resolve_confidence(pat_idx); if confidence < self.confidence_threshold { continue; } - if self.allow_set.contains(value.as_str()) { + if allow.contains(value.as_str()) { continue; } @@ -211,13 +214,13 @@ impl PatternEngine { } } - /// Phase 3: inject deny-list values found in `text` that were not - /// already matched by regex or dictionary. - fn scan_deny_list(&self, text: &str, results: &mut Vec) { + /// Phase 3: inject deny-list values found in `text` not already + /// matched by regex or dictionary. + fn scan_deny_list(&self, text: &str, deny: &DenyList, results: &mut Vec) { let matched_values: HashSet<&str> = results.iter().map(|r| r.value.as_str()).collect(); let mut deny_matches = Vec::new(); - for (deny_value, deny_rule) in self.deny_set.iter() { + for (deny_value, deny_rule) in deny.iter() { if matched_values.contains(deny_value) { continue; } @@ -249,26 +252,24 @@ static DEFAULT_ENGINE: LazyLock = LazyLock::new(|| { .expect("built-in patterns must compile") }); -/// Return a reference to the lazily-initialised default [`PatternEngine`] -/// containing all built-in patterns. 
-pub fn default_engine() -> &'static PatternEngine { - &DEFAULT_ENGINE -} - #[cfg(test)] mod tests { use super::*; + fn empty_ctx() -> ScanContext { + ScanContext::default() + } + #[test] fn default_engine_builds() { - let engine = default_engine(); + let engine = PatternEngine::instance(); assert!(!engine.regex_entries.is_empty()); } #[test] fn scan_text_finds_ssn() { - let engine = default_engine(); - let matches = engine.scan_text("My SSN is 123-45-6789."); + let engine = PatternEngine::instance(); + let matches = engine.scan_text("My SSN is 123-45-6789.", &empty_ctx()); assert!( matches .iter() @@ -280,8 +281,8 @@ mod tests { #[test] fn scan_text_finds_email() { - let engine = default_engine(); - let matches = engine.scan_text("Contact: alice@example.com"); + let engine = PatternEngine::instance(); + let matches = engine.scan_text("Contact: alice@example.com", &empty_ctx()); assert!( matches .iter() @@ -297,7 +298,7 @@ mod tests { .with_confidence_threshold(0.99) .build() .unwrap(); - let matches = engine.scan_text("My SSN is 123-45-6789."); + let matches = engine.scan_text("My SSN is 123-45-6789.", &empty_ctx()); assert!( !matches .iter() @@ -318,9 +319,9 @@ mod tests { #[test] fn scan_text_returns_correct_offsets() { - let engine = default_engine(); + let engine = PatternEngine::instance(); let text = "SSN: 123-45-6789"; - let matches = engine.scan_text(text); + let matches = engine.scan_text(text, &empty_ctx()); let ssn_match = matches .iter() .find(|m| m.pattern_name.as_deref() == Some("ssn")) @@ -330,8 +331,8 @@ mod tests { #[test] fn dictionary_matches_are_found() { - let engine = default_engine(); - let matches = engine.scan_text("She is American and speaks English."); + let engine = PatternEngine::instance(); + let matches = engine.scan_text("She is American and speaks English.", &empty_ctx()); assert!( matches.iter().any(|m| m .recognition_methods @@ -348,10 +349,10 @@ mod tests { fn allow_list_suppresses_match() { let engine = PatternEngine::builder() 
.with_patterns(&["ssn"]) - .with_allow(AllowList::new().with("123-45-6789")) .build() .unwrap(); - let matches = engine.scan_text("SSN: 123-45-6789"); + let ctx = ScanContext::new().with_allow(AllowList::new().with("123-45-6789")); + let matches = engine.scan_text("SSN: 123-45-6789", &ctx); assert!( !matches .iter() @@ -364,15 +365,18 @@ mod tests { fn deny_list_injects_match() { let deny = DenyList::new().with( "secret-value-42", - EntityCategory::PersonalIdentity, - EntityKind::PersonName, + DenyRule { + category: EntityCategory::PersonalIdentity, + entity_kind: EntityKind::PersonName, + method: RecognitionMethod::Ner, + }, ); let engine = PatternEngine::builder() .with_patterns(&["email"]) - .with_deny(deny) .build() .unwrap(); - let matches = engine.scan_text("The secret-value-42 should be detected."); + let ctx = ScanContext::new().with_deny(deny); + let matches = engine.scan_text("The secret-value-42 should be detected.", &ctx); let deny_match = matches .iter() .find(|m| m.pattern_name.is_none()) @@ -380,25 +384,25 @@ mod tests { assert_eq!(deny_match.value, "secret-value-42"); assert_eq!(deny_match.confidence, 1.0); assert_eq!(deny_match.entity_kind, EntityKind::PersonName); - assert_eq!( - deny_match.recognition_methods, - vec![RecognitionMethod::Dictionary] - ); + assert_eq!(deny_match.recognition_methods, vec![RecognitionMethod::Ner]); } #[test] fn deny_list_not_injected_when_absent() { let deny = DenyList::new().with( "not-in-text", - EntityCategory::PersonalIdentity, - EntityKind::PersonName, + DenyRule { + category: EntityCategory::PersonalIdentity, + entity_kind: EntityKind::PersonName, + method: RecognitionMethod::Manual, + }, ); let engine = PatternEngine::builder() .with_patterns(&["email"]) - .with_deny(deny) .build() .unwrap(); - let matches = engine.scan_text("Nothing special here."); + let ctx = ScanContext::new().with_deny(deny); + let matches = engine.scan_text("Nothing special here.", &ctx); assert!( !matches.iter().any(|m| 
m.pattern_name.is_none()), "deny list value not in text should not be injected" @@ -415,28 +419,36 @@ mod tests { } #[test] - fn deny_list_from_iterator() { - let deny: DenyList = [ - ( - "secret", - EntityCategory::PersonalIdentity, - EntityKind::PersonName, - ), - ("other", EntityCategory::Financial, EntityKind::PaymentCard), - ] - .into_iter() - .collect(); + fn deny_list_insert_and_lookup() { + let mut deny = DenyList::new(); + deny.insert( + "secret", + DenyRule { + category: EntityCategory::PersonalIdentity, + entity_kind: EntityKind::PersonName, + method: RecognitionMethod::Ner, + }, + ); + deny.insert( + "other", + DenyRule { + category: EntityCategory::Financial, + entity_kind: EntityKind::PaymentCard, + method: RecognitionMethod::Manual, + }, + ); assert_eq!(deny.len(), 2); assert!(deny.contains("secret")); let rule = deny.get("other").unwrap(); assert_eq!(rule.category, EntityCategory::Financial); + assert_eq!(rule.method, RecognitionMethod::Manual); } #[test] fn column_confidence_applies_to_csv_dictionaries() { - let engine = default_engine(); + let engine = PatternEngine::instance(); // "US Dollar" is column 0 (full name), "USD" is column 1 (code). 
- let matches = engine.scan_text("I paid in US Dollar and also in USD."); + let matches = engine.scan_text("I paid in US Dollar and also in USD.", &empty_ctx()); let full_name = matches.iter().find(|m| m.value == "US Dollar"); let code = matches.iter().find(|m| m.value == "USD"); assert!(full_name.is_some(), "should match 'US Dollar'"); @@ -455,7 +467,7 @@ mod tests { .with_patterns(&["ssn"]) .build() .unwrap(); - let matches = engine.scan_text("SSN: 123-45-6789"); + let matches = engine.scan_text("SSN: 123-45-6789", &empty_ctx()); let ssn_match = matches .iter() .find(|m| m.pattern_name.as_deref() == Some("ssn")) diff --git a/crates/nvisy-pattern/src/engine/pattern_match.rs b/crates/nvisy-pattern/src/engine/pattern_match.rs index 91c38ef..8af6afb 100644 --- a/crates/nvisy-pattern/src/engine/pattern_match.rs +++ b/crates/nvisy-pattern/src/engine/pattern_match.rs @@ -1,4 +1,4 @@ -//! [`RawMatch`] — output type from pattern scanning. +//! [`RawMatch`]: output type from pattern scanning. use nvisy_ontology::entity::{Entity, EntityCategory, EntityKind, RecognitionMethod}; @@ -8,7 +8,7 @@ use crate::patterns::ContextRule; #[derive(Debug, Clone)] pub struct RawMatch { /// Name of the pattern that produced this match, or `None` for - /// deny-list injected matches. + /// deny-list–injected matches. pub pattern_name: Option, /// Entity category of the match. pub category: EntityCategory, @@ -36,12 +36,13 @@ impl RawMatch { /// The returned entity has no location or parent set: the caller /// should attach those from the span context via /// [`Entity::with_location`] and [`Entity::with_parent`]. + /// /// # Panics /// /// Panics if `recognition_methods` is empty. All engine-produced /// matches always carry at least one method. 
pub fn into_entity(self) -> Entity { - assert!( + debug_assert!( !self.recognition_methods.is_empty(), "RawMatch::into_entity requires at least one recognition method" ); diff --git a/crates/nvisy-pattern/src/engine/scan_context.rs b/crates/nvisy-pattern/src/engine/scan_context.rs new file mode 100644 index 0000000..19e8176 --- /dev/null +++ b/crates/nvisy-pattern/src/engine/scan_context.rs @@ -0,0 +1,50 @@ +//! [`ScanContext`]: per-scan allow/deny list configuration. + +use super::allow_list::AllowList; +use super::deny_list::DenyList; + +/// Per-scan configuration for allow and deny lists. +/// +/// Passed to [`PatternEngine::scan_text`](super::PatternEngine::scan_text) +/// to control per-invocation suppression and forced detection without +/// rebuilding the engine. +/// +/// # Examples +/// +/// ```rust,ignore +/// use nvisy_pattern::prelude::*; +/// use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod}; +/// +/// let ctx = ScanContext::new() +/// .with_allow(AllowList::new().with("000-00-0000")) +/// .with_deny(DenyList::new().with("secret", DenyRule { +/// category: EntityCategory::PersonalIdentity, +/// entity_kind: EntityKind::PersonName, +/// method: RecognitionMethod::Manual, +/// })); +/// let matches = PatternEngine::instance().scan_text("text", &ctx); +/// ``` +#[derive(Debug, Clone, Default)] +pub struct ScanContext { + pub(super) allow: AllowList, + pub(super) deny: DenyList, +} + +impl ScanContext { + /// Create an empty scan context (no allow/deny filtering). + pub fn new() -> Self { + Self::default() + } + + /// Set the allow list. + pub fn with_allow(mut self, list: AllowList) -> Self { + self.allow = list; + self + } + + /// Set the deny list. 
+ pub fn with_deny(mut self, list: DenyList) -> Self { + self.deny = list; + self + } +} diff --git a/crates/nvisy-pattern/src/lib.rs b/crates/nvisy-pattern/src/lib.rs index fc1e04e..f355580 100644 --- a/crates/nvisy-pattern/src/lib.rs +++ b/crates/nvisy-pattern/src/lib.rs @@ -3,13 +3,13 @@ #![doc = include_str!("../README.md")] pub mod dictionaries; -pub mod engine; -pub(crate) mod patterns; +pub(crate) mod engine; +pub mod patterns; pub(crate) mod validators; -pub use self::dictionaries::{DictionaryLoadError, DictionaryRegistry}; -pub use self::engine::{PatternEngine, PatternEngineBuilder, RawMatch}; -pub use self::patterns::ContextRule; +pub use self::engine::{ + AllowList, DenyList, DenyRule, PatternEngine, PatternEngineBuilder, RawMatch, ScanContext, +}; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-pattern/src/patterns/context_rule.rs b/crates/nvisy-pattern/src/patterns/context_rule.rs index 843c1ee..1e596bd 100644 --- a/crates/nvisy-pattern/src/patterns/context_rule.rs +++ b/crates/nvisy-pattern/src/patterns/context_rule.rs @@ -1,11 +1,11 @@ -//! [`ContextRule`] — co-occurrence context for span-level confidence boosting. +//! [`ContextRule`]: co-occurrence context for span-level confidence boosting. use serde::Deserialize; /// Co-occurrence context rule for span-level confidence boosting. /// /// When a pattern match is found, nearby spans are searched for any of the -/// `keywords`. If at least one keyword is present within `window` spans, +/// `keywords`. If at least one keyword is present within `window` spans, /// the match confidence is increased by `boost` (clamped to `[0.0, 1.0]`). #[derive(Debug, Clone, Deserialize)] #[serde(try_from = "RawContextRule")] @@ -19,12 +19,12 @@ pub struct ContextRule { pub boost: f64, /// Whether keyword matching is case-sensitive. /// - /// Defaults to `false` (case-insensitive). + /// Defaults to `false`: case-insensitive. 
pub case_sensitive: bool, } /// Serde intermediary that mirrors the JSON shape before validation. -#[derive(Deserialize)] +#[derive(Debug, Clone, Deserialize)] struct RawContextRule { keywords: Vec, #[serde(default = "default_window")] diff --git a/crates/nvisy-pattern/src/patterns/json_pattern.rs b/crates/nvisy-pattern/src/patterns/json_pattern.rs index a7cfc0d..cc7634d 100644 --- a/crates/nvisy-pattern/src/patterns/json_pattern.rs +++ b/crates/nvisy-pattern/src/patterns/json_pattern.rs @@ -1,9 +1,9 @@ -//! JSON-backed `JsonPattern` implementation. +//! JSON-backed [`JsonPattern`] implementation. //! //! Each JSON file under `assets/patterns/` is deserialized into a -//! `JsonPattern` via `from_bytes`. The method returns the validated -//! pattern together with any non-fatal `JsonPatternWarning`s so the -//! caller can decide how to surface them. +//! [`JsonPattern`] via [`from_bytes`](JsonPattern::from_bytes). The method +//! returns the validated pattern together with any non-fatal +//! [`JsonPatternWarning`]s so the caller can decide how to surface them. use nvisy_ontology::entity::{EntityCategory, EntityKind}; use serde::Deserialize; @@ -34,7 +34,7 @@ pub enum JsonPatternWarning { /// A detection pattern deserialized from a JSON definition file. /// /// Implements the [`Pattern`] trait and is the only concrete implementation -/// shipped with this crate. Construct via `from_bytes`. +/// shipped with this crate. Construct via [`from_bytes`](Self::from_bytes). #[derive(Debug, Clone)] pub struct JsonPattern { name: String, @@ -48,7 +48,7 @@ impl JsonPattern { /// Deserialize and validate a pattern from raw JSON bytes. /// /// `validators` is used to check whether a referenced validator name - /// is registered; unrecognised names produce a [`JsonPatternWarning`] + /// is registered: unrecognised names produce a [`JsonPatternWarning`] /// but do not prevent loading. 
/// /// On success returns the pattern together with a (possibly empty) @@ -63,7 +63,7 @@ impl JsonPattern { bytes: &[u8], validators: &ValidatorResolver, ) -> Result<(Self, Vec), JsonPatternError> { - /// Serde helper: exactly one of `pattern` or `dictionary`. + /// Serde helper: exactly one of `pattern` or `dictionary` must be present. #[derive(Deserialize)] #[serde(untagged)] enum RawSource { diff --git a/crates/nvisy-pattern/src/patterns/mod.rs b/crates/nvisy-pattern/src/patterns/mod.rs index ff0e5db..7adc4e6 100644 --- a/crates/nvisy-pattern/src/patterns/mod.rs +++ b/crates/nvisy-pattern/src/patterns/mod.rs @@ -1,7 +1,7 @@ //! Built-in detection patterns. //! //! Each pattern is a JSON file under `assets/patterns/` that describes how -//! to detect a single entity type. Files are embedded at compile time with +//! to detect a single entity type. Files are embedded at compile time with //! `include_dir!` and auto-discovered by [`PatternRegistry::load_builtins`]. //! //! # Key types @@ -10,258 +10,17 @@ //! - [`JsonPattern`]: concrete implementation deserialized from JSON. //! - [`MatchSource`]: whether matching is regex-based or dictionary-based. //! - [`ContextRule`]: optional co-occurrence keywords for confidence boosting. -//! - [`PatternRegistry`]: sorted collection with O(log n) lookup by name. +//! - [`PatternRegistry`]: sorted collection with O(log n) lookup. //! - [`JsonPatternWarning`]: non-fatal load-time diagnostics. 
mod context_rule; mod json_pattern; mod pattern; - -use std::collections::BTreeMap; -use std::sync::LazyLock; - -use include_dir::{Dir, include_dir}; +mod pattern_error; +mod pattern_registry; pub use self::context_rule::ContextRule; pub use self::json_pattern::{JsonPattern, JsonPatternWarning}; pub use self::pattern::{BoxPattern, DictionaryConfidence, MatchSource, Pattern}; -use crate::validators::ValidatorResolver; - -const TARGET: &str = "nvisy_pattern::patterns"; - -/// A registry of named [`Pattern`] definitions with O(log n) lookup. -/// -/// Use [`load_builtins`] to create a registry pre-populated with -/// the compile-time-embedded pattern files. -/// -/// [`load_builtins`]: Self::load_builtins -pub struct PatternRegistry { - inner: BTreeMap, -} - -impl std::fmt::Debug for PatternRegistry { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let names: Vec<&str> = self.inner.keys().map(|s| s.as_str()).collect(); - f.debug_struct("PatternRegistry") - .field("len", &self.inner.len()) - .field("names", &names) - .finish() - } -} - -impl PatternRegistry { - /// Create an empty registry. - pub fn new() -> Self { - Self { - inner: BTreeMap::new(), - } - } - - /// Insert a pattern, keyed by its [`Pattern::name`]. - pub fn insert(&mut self, pattern: BoxPattern) { - let name = pattern.name().to_owned(); - self.inner.insert(name, pattern); - } - - /// Look up a pattern by name. - #[must_use] - pub fn get(&self, name: &str) -> Option<&dyn Pattern> { - self.inner.get(name).map(|b| b.as_ref()) - } - - /// Iterate over all registered patterns as `&dyn Pattern` in - /// deterministic (alphabetical) order. - pub fn iter(&self) -> impl Iterator { - self.inner.values().map(|b| b.as_ref()) - } - - /// Iterate over all registered pattern names. - pub fn names(&self) -> impl Iterator { - self.inner.keys().map(|s| s.as_str()) - } - - /// Total number of registered patterns. 
- #[must_use] - pub fn len(&self) -> usize { - self.inner.len() - } - - /// Whether the registry contains no patterns. - #[must_use] - pub fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - /// Load all `.json` files from the embedded `assets/patterns/` - /// directory and return a populated registry. - /// - /// Files that fail to parse are logged as warnings and skipped. - #[tracing::instrument(target = TARGET, name = "patterns.load_builtins", fields(count))] - pub fn load_builtins() -> Self { - static PATTERN_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/assets/patterns"); - - let validators = ValidatorResolver::builtins(); - let mut reg = Self::new(); - - for file in PATTERN_DIR.files() { - let path = file.path(); - - let Some("json") = path.extension().and_then(|e| e.to_str()) else { - tracing::warn!( - target: TARGET, - path = %path.display(), - "skipping non-JSON file in patterns directory", - ); - continue; - }; - - let (pattern, warnings) = match JsonPattern::from_bytes(file.contents(), &validators) { - Ok(pair) => pair, - Err(e) => { - tracing::warn!( - target: TARGET, - path = %path.display(), - error = %e, - "failed to load pattern, skipping", - ); - continue; - } - }; - - for w in &warnings { - match w { - JsonPatternWarning::UnknownValidator { pattern, validator } => { - tracing::warn!(target: TARGET, %pattern, %validator, "unknown validator name, pattern will have no post-match validation"); - } - } - } - - tracing::trace!( - target: TARGET, - name = %pattern.name(), - category = %pattern.category(), - entity_kind = %pattern.entity_kind(), - match_source = ?pattern.match_source(), - "pattern loaded", - ); - reg.insert(Box::new(pattern)); - } - - tracing::Span::current().record("count", reg.len()); - tracing::debug!(target: TARGET, "built-in patterns loaded"); - reg - } -} - -impl Default for PatternRegistry { - fn default() -> Self { - Self::new() - } -} - -static BUILTIN_REGISTRY: LazyLock = LazyLock::new(PatternRegistry::load_builtins); - 
-/// Return a reference to the lazily-initialised built-in [`PatternRegistry`]. -pub fn builtin_registry() -> &'static PatternRegistry { - &BUILTIN_REGISTRY -} - -#[cfg(test)] -mod tests { - use super::pattern::RegexPattern; - use super::*; - - fn registry() -> &'static PatternRegistry { - builtin_registry() - } - - #[test] - fn builtins_load() { - assert!(!registry().is_empty()); - } - - #[test] - fn pattern_names_are_sorted() { - let names: Vec<&str> = registry().names().collect(); - let mut sorted = names.clone(); - sorted.sort(); - assert_eq!(names, sorted); - } - - #[test] - fn no_duplicate_pattern_names() { - let names: Vec<_> = registry().names().collect(); - let unique: std::collections::HashSet<_> = names.iter().collect(); - assert_eq!(names.len(), unique.len(), "duplicate pattern names found"); - } - - #[test] - fn all_patterns_have_valid_fields() { - for p in registry().iter() { - assert!(!p.name().is_empty(), "pattern name is empty"); - match p.match_source() { - MatchSource::Regex(rp) => { - assert!(!rp.regex.is_empty(), "regex is empty for {}", p.name()); - assert!(rp.confidence > 0.0, "confidence is 0 for {}", p.name()); - assert!(rp.confidence <= 1.0, "confidence > 1 for {}", p.name()); - } - MatchSource::Dictionary(dp) => { - assert!(!dp.name.is_empty(), "dictionary is empty for {}", p.name()); - let c = dp.confidence.resolve(0); - assert!(c > 0.0, "confidence is 0 for {}", p.name()); - assert!(c <= 1.0, "confidence > 1 for {}", p.name()); - } - } - } - } - - #[test] - fn all_regex_patterns_compile() { - for p in registry().iter() { - if let MatchSource::Regex(rp) = p.match_source() { - assert!( - regex::Regex::new(&rp.effective_regex()).is_ok(), - "pattern {} failed to compile: {}", - p.name(), - rp.regex, - ); - } - } - } - - #[test] - fn all_validators_resolve() { - let resolver = crate::validators::ValidatorResolver::builtins(); - for p in registry().iter() { - if let MatchSource::Regex(RegexPattern { - validator: Some(name), - .. 
- }) = p.match_source() - { - assert!( - resolver.resolve(name).is_some(), - "pattern {} references unregistered validator {name}", - p.name(), - ); - } - } - } - - #[test] - fn registry_insert_and_get() { - let validators = ValidatorResolver::builtins(); - let json = br#"{ - "name": "test", - "category": "personal_identity", - "entity_type": "government_id", - "pattern": { "regex": "\\d+", "confidence": 0.9 } - }"#; - let (pattern, _warnings) = JsonPattern::from_bytes(json, &validators).unwrap(); - - let mut reg = PatternRegistry::new(); - reg.insert(Box::new(pattern)); - - assert_eq!(reg.len(), 1); - assert_eq!(reg.get("test").unwrap().name(), "test"); - } -} +pub(crate) use self::pattern_error::PatternLoadError; +pub use self::pattern_registry::{PatternRegistry, builtin_registry}; diff --git a/crates/nvisy-pattern/src/patterns/pattern.rs b/crates/nvisy-pattern/src/patterns/pattern.rs index 4f1091c..1d074d8 100644 --- a/crates/nvisy-pattern/src/patterns/pattern.rs +++ b/crates/nvisy-pattern/src/patterns/pattern.rs @@ -1,8 +1,4 @@ -//! Core [`Pattern`] trait, [`MatchSource`] enum, and [`BoxPattern`] alias. -//! -//! [`Pattern`]: crate::patterns::Pattern -//! [`MatchSource`]: crate::patterns::MatchSource -//! [`BoxPattern`]: crate::patterns::BoxPattern +//! Core [`Pattern`] trait, [`MatchSource`] enum, and [`BoxPattern`] type alias. use nvisy_ontology::entity::{EntityCategory, EntityKind}; use serde::Deserialize; @@ -14,7 +10,7 @@ use super::context_rule::ContextRule; pub struct RegexPattern { /// The regular expression string. pub regex: String, - /// Optional validator name (e.g. `"luhn"`, `"ssn"`, `"iban"`), + /// Optional validator name (e.g. `"luhn"`, `"ssn"`, `"iban"`): /// resolved at detection time via [`ValidatorResolver`]. /// /// [`ValidatorResolver`]: crate::validators::ValidatorResolver @@ -22,7 +18,7 @@ pub struct RegexPattern { pub validator: Option, /// Whether the regex is case-sensitive. /// - /// Defaults to `true`. 
When `false`, the regex is compiled with + /// Defaults to `true`. When `false`, the regex is compiled with /// an inline `(?i)` prefix. #[serde(default = "default_case_sensitive")] pub case_sensitive: bool, @@ -76,7 +72,7 @@ impl Default for DictionaryConfidence { } } -/// Serde helper — accepts either a single number or an array of numbers. +/// Serde helper: accepts either a single number or an array of numbers. mod confidence_serde { use serde::{Deserialize, Deserializer}; @@ -108,15 +104,15 @@ pub struct DictionaryPattern { pub name: String, /// Whether matching is case-sensitive. /// - /// Defaults to `false`. Controls the Aho-Corasick automaton's + /// Defaults to `false`. Controls the Aho-Corasick automaton's /// `ascii_case_insensitive` setting. #[serde(default)] pub case_sensitive: bool, /// Confidence score(s) for matches from this dictionary. /// - /// A single number applies uniformly to all entries. - /// An array assigns per-column confidence for CSV dictionaries - /// (e.g. `[0.85, 0.55]` gives column 0 entries 0.85 and column 1 + /// A single number applies uniformly to all entries. An array + /// assigns per-column confidence for CSV dictionaries (e.g. + /// `[0.85, 0.55]` gives column 0 entries 0.85 and column 1 /// entries 0.55). /// /// Defaults to `1.0` when not specified. diff --git a/crates/nvisy-pattern/src/patterns/pattern_error.rs b/crates/nvisy-pattern/src/patterns/pattern_error.rs new file mode 100644 index 0000000..8f29f1e --- /dev/null +++ b/crates/nvisy-pattern/src/patterns/pattern_error.rs @@ -0,0 +1,42 @@ +//! Error type for pattern filesystem loading. + +use nvisy_core::{Error, ErrorKind}; + +use super::json_pattern::JsonPatternError; + +/// Error returned when loading patterns from the filesystem. +#[derive(Debug, thiserror::Error)] +pub enum PatternLoadError { + /// The directory could not be read. 
+ #[error("failed to read pattern directory '{}': {source}", path.display())] + ReadDir { + path: std::path::PathBuf, + source: std::io::Error, + }, + /// A pattern file could not be read. + #[error("failed to read pattern file '{}': {source}", path.display())] + ReadFile { + path: std::path::PathBuf, + source: std::io::Error, + }, + /// A pattern file failed to parse. + #[error("failed to parse pattern '{}': {source}", path.display())] + Parse { + path: std::path::PathBuf, + source: JsonPatternError, + }, +} + +impl From for Error { + fn from(err: PatternLoadError) -> Self { + let kind = match &err { + PatternLoadError::ReadDir { .. } | PatternLoadError::ReadFile { .. } => { + ErrorKind::Internal + } + PatternLoadError::Parse { .. } => ErrorKind::Validation, + }; + Error::new(kind, err.to_string()) + .with_component("nvisy-pattern::patterns") + .with_source(err) + } +} diff --git a/crates/nvisy-pattern/src/patterns/pattern_registry.rs b/crates/nvisy-pattern/src/patterns/pattern_registry.rs new file mode 100644 index 0000000..6a66818 --- /dev/null +++ b/crates/nvisy-pattern/src/patterns/pattern_registry.rs @@ -0,0 +1,405 @@ +//! [`PatternRegistry`]: named pattern collection with O(log n) lookup. + +use std::collections::BTreeMap; +use std::path::Path; +use std::sync::LazyLock; + +use include_dir::{Dir, include_dir}; + +use super::{BoxPattern, JsonPattern, JsonPatternWarning, Pattern, PatternLoadError}; +use crate::validators::ValidatorResolver; + +const TARGET: &str = "nvisy_pattern::patterns"; + +/// A registry of named [`Pattern`] definitions with O(log n) lookup. +/// +/// Use [`load_builtins`] to populate with the compile-time-embedded +/// pattern files, or [`load_dir`] / [`load_file`] to load from the +/// filesystem at runtime. 
+/// +/// [`load_builtins`]: Self::load_builtins +/// [`load_dir`]: Self::load_dir +/// [`load_file`]: Self::load_file +#[derive(Default)] +pub struct PatternRegistry { + inner: BTreeMap, +} + +impl std::fmt::Debug for PatternRegistry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let names: Vec<&str> = self.inner.keys().map(|s| s.as_str()).collect(); + f.debug_struct("PatternRegistry") + .field("len", &self.inner.len()) + .field("names", &names) + .finish() + } +} + +impl PatternRegistry { + /// Create an empty registry. + pub fn new() -> Self { + Self::default() + } + + /// Insert a pattern, keyed by its [`Pattern::name`]. + pub fn insert(&mut self, pattern: BoxPattern) { + let name = pattern.name().to_owned(); + self.inner.insert(name, pattern); + } + + /// Look up a pattern by name. + #[must_use] + pub fn get(&self, name: &str) -> Option<&dyn Pattern> { + self.inner.get(name).map(|b| b.as_ref()) + } + + /// Iterate over all registered patterns as `&dyn Pattern` in + /// deterministic (alphabetical) order. + pub fn iter(&self) -> impl Iterator { + self.inner.values().map(|b| b.as_ref()) + } + + /// Iterate over all registered pattern names. + pub fn names(&self) -> impl Iterator { + self.inner.keys().map(|s| s.as_str()) + } + + /// Total number of registered patterns. + #[must_use] + pub fn len(&self) -> usize { + self.inner.len() + } + + /// Whether the registry contains no patterns. + #[must_use] + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Load all `.json` files from the embedded `assets/patterns/` + /// directory into this registry. + /// + /// Files that fail to parse are logged as warnings and skipped. 
+ #[tracing::instrument(target = TARGET, name = "patterns.load_builtins", skip(self), fields(count))] + pub fn load_builtins(&mut self) { + static PATTERN_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/assets/patterns"); + + let validators = ValidatorResolver::builtins(); + + for file in PATTERN_DIR.files() { + let path = file.path(); + + let Some("json") = path.extension().and_then(|e| e.to_str()) else { + tracing::warn!( + target: TARGET, + path = %path.display(), + "skipping non-JSON file in patterns directory", + ); + continue; + }; + + let (pattern, warnings) = match JsonPattern::from_bytes(file.contents(), &validators) { + Ok(pair) => pair, + Err(e) => { + tracing::warn!( + target: TARGET, + path = %path.display(), + error = %e, + "failed to load pattern, skipping", + ); + continue; + } + }; + + Self::log_warnings(&warnings); + + tracing::trace!( + target: TARGET, + name = %pattern.name(), + category = %pattern.category(), + entity_kind = %pattern.entity_kind(), + match_source = ?pattern.match_source(), + "pattern loaded", + ); + self.insert(Box::new(pattern)); + } + + tracing::Span::current().record("count", self.len()); + tracing::debug!(target: TARGET, "built-in patterns loaded"); + } + + /// Load a single `.json` pattern file and insert it. + /// + /// The pattern name is derived from the JSON `"name"` field, not + /// the file name. Files with non-`.json` extensions are logged as + /// warnings and ignored (no error is returned). + /// + /// # Errors + /// + /// Returns [`nvisy_core::Error`] if the file cannot be read or + /// the JSON content cannot be parsed. 
+ #[tracing::instrument(target = TARGET, name = "patterns.load_file", skip_all, fields(path = %path.as_ref().display()))] + pub fn load_file(&mut self, path: impl AsRef) -> nvisy_core::Result<()> { + let path = path.as_ref(); + + let Some("json") = path.extension().and_then(|e| e.to_str()) else { + tracing::warn!( + target: TARGET, + path = %path.display(), + "skipping non-JSON pattern file", + ); + return Ok(()); + }; + + let bytes = std::fs::read(path).map_err(|source| PatternLoadError::ReadFile { + path: path.to_owned(), + source, + })?; + + let validators = ValidatorResolver::builtins(); + let (pattern, warnings) = + JsonPattern::from_bytes(&bytes, &validators).map_err(|source| { + PatternLoadError::Parse { + path: path.to_owned(), + source, + } + })?; + + Self::log_warnings(&warnings); + + tracing::trace!( + target: TARGET, + name = %pattern.name(), + category = %pattern.category(), + entity_kind = %pattern.entity_kind(), + match_source = ?pattern.match_source(), + "pattern loaded from filesystem", + ); + self.insert(Box::new(pattern)); + Ok(()) + } + + /// Load all `.json` files from a filesystem directory. + /// + /// Non-`.json` files are logged as warnings and skipped. Loaded + /// patterns are inserted into `self`, so this can be called after + /// [`load_builtins`](Self::load_builtins) to layer user-provided + /// patterns on top of the built-ins. + /// + /// # Errors + /// + /// Returns [`nvisy_core::Error`] if the directory cannot be read, + /// a file cannot be read, or a JSON file fails to parse. 
+ #[tracing::instrument(target = TARGET, name = "patterns.load_dir", skip_all, fields(path = %dir.as_ref().display(), count))] + pub fn load_dir(&mut self, dir: impl AsRef) -> nvisy_core::Result<()> { + let dir = dir.as_ref(); + + let entries = std::fs::read_dir(dir).map_err(|source| PatternLoadError::ReadDir { + path: dir.to_owned(), + source, + })?; + + let mut count = 0usize; + for entry in entries { + let entry = entry.map_err(|source| PatternLoadError::ReadDir { + path: dir.to_owned(), + source, + })?; + let path = entry.path(); + + if !path.is_file() { + continue; + } + + self.load_file(&path)?; + count += 1; + } + + tracing::Span::current().record("count", count); + tracing::debug!(target: TARGET, "filesystem patterns loaded"); + Ok(()) + } + + fn log_warnings(warnings: &[JsonPatternWarning]) { + for w in warnings { + match w { + JsonPatternWarning::UnknownValidator { pattern, validator } => { + tracing::warn!( + target: TARGET, + %pattern, + %validator, + "unknown validator name, pattern will have no post-match validation", + ); + } + } + } + } +} + +static BUILTIN_REGISTRY: LazyLock = LazyLock::new(|| { + let mut reg = PatternRegistry::new(); + reg.load_builtins(); + reg +}); + +/// Return a reference to the lazily-initialised built-in [`PatternRegistry`]. 
+pub fn builtin_registry() -> &'static PatternRegistry { + &BUILTIN_REGISTRY +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use super::super::json_pattern::JsonPattern; + use super::super::pattern::{MatchSource, RegexPattern}; + use super::*; + use crate::validators::ValidatorResolver; + + fn registry() -> &'static PatternRegistry { + builtin_registry() + } + + #[test] + fn builtins_load() { + assert!(!registry().is_empty()); + } + + #[test] + fn pattern_names_are_sorted() { + let names: Vec<&str> = registry().names().collect(); + let mut sorted = names.clone(); + sorted.sort(); + assert_eq!(names, sorted); + } + + #[test] + fn no_duplicate_pattern_names() { + let names: Vec<_> = registry().names().collect(); + let unique: HashSet<_> = names.iter().collect(); + assert_eq!(names.len(), unique.len(), "duplicate pattern names found"); + } + + #[test] + fn all_patterns_have_valid_fields() { + for p in registry().iter() { + assert!(!p.name().is_empty(), "pattern name is empty"); + match p.match_source() { + MatchSource::Regex(rp) => { + assert!(!rp.regex.is_empty(), "regex is empty for {}", p.name()); + assert!(rp.confidence > 0.0, "confidence is 0 for {}", p.name()); + assert!(rp.confidence <= 1.0, "confidence > 1 for {}", p.name()); + } + MatchSource::Dictionary(dp) => { + assert!(!dp.name.is_empty(), "dictionary is empty for {}", p.name()); + let c = dp.confidence.resolve(0); + assert!(c > 0.0, "confidence is 0 for {}", p.name()); + assert!(c <= 1.0, "confidence > 1 for {}", p.name()); + } + } + } + } + + #[test] + fn all_regex_patterns_compile() { + for p in registry().iter() { + if let MatchSource::Regex(rp) = p.match_source() { + assert!( + regex::Regex::new(&rp.effective_regex()).is_ok(), + "pattern {} failed to compile: {}", + p.name(), + rp.regex, + ); + } + } + } + + #[test] + fn all_validators_resolve() { + let resolver = ValidatorResolver::builtins(); + for p in registry().iter() { + if let MatchSource::Regex(RegexPattern { + validator: 
Some(name), + .. + }) = p.match_source() + { + assert!( + resolver.resolve(name).is_some(), + "pattern {} references unregistered validator {name}", + p.name(), + ); + } + } + } + + #[test] + fn registry_insert_and_get() { + let validators = ValidatorResolver::builtins(); + let json = br#"{ + "name": "test", + "category": "personal_identity", + "entity_type": "government_id", + "pattern": { "regex": "\\d+", "confidence": 0.9 } + }"#; + let (pattern, _warnings) = JsonPattern::from_bytes(json, &validators).unwrap(); + + let mut reg = PatternRegistry::new(); + reg.insert(Box::new(pattern)); + + assert_eq!(reg.len(), 1); + assert_eq!(reg.get("test").unwrap().name(), "test"); + } + + #[test] + fn load_dir_reads_filesystem() { + let dir = tempfile::tempdir().unwrap(); + + std::fs::write( + dir.path().join("test_pattern.json"), + r#"{ + "name": "test_fs", + "category": "personal_identity", + "entity_type": "government_id", + "pattern": { "regex": "\\d{3}", "confidence": 0.8 } + }"#, + ) + .unwrap(); + // Should be skipped. 
+ std::fs::write(dir.path().join("readme.md"), "ignore me").unwrap(); + + let mut reg = PatternRegistry::new(); + reg.load_dir(dir.path()).unwrap(); + + assert_eq!(reg.len(), 1); + assert_eq!(reg.get("test_fs").unwrap().name(), "test_fs"); + } + + #[test] + fn load_dir_missing_directory() { + let mut reg = PatternRegistry::new(); + let result = reg.load_dir("/nonexistent/path"); + assert!(result.is_err()); + } + + #[test] + fn load_file_single_pattern() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("single.json"); + std::fs::write( + &path, + r#"{ + "name": "single_test", + "category": "contact_info", + "entity_type": "email_address", + "pattern": { "regex": ".+@.+", "confidence": 0.7 } + }"#, + ) + .unwrap(); + + let mut reg = PatternRegistry::new(); + reg.load_file(&path).unwrap(); + + assert_eq!(reg.len(), 1); + assert_eq!(reg.get("single_test").unwrap().name(), "single_test"); + } +} diff --git a/crates/nvisy-pattern/src/prelude.rs b/crates/nvisy-pattern/src/prelude.rs index 3594e8f..c15685a 100644 --- a/crates/nvisy-pattern/src/prelude.rs +++ b/crates/nvisy-pattern/src/prelude.rs @@ -5,6 +5,5 @@ //! ``` pub use crate::{ - ContextRule, DictionaryLoadError, DictionaryRegistry, PatternEngine, PatternEngineBuilder, - RawMatch, + AllowList, DenyList, DenyRule, PatternEngine, PatternEngineBuilder, RawMatch, ScanContext, }; diff --git a/crates/nvisy-pattern/src/validators/luhn.rs b/crates/nvisy-pattern/src/validators/luhn.rs index dd257fd..0d26d89 100644 --- a/crates/nvisy-pattern/src/validators/luhn.rs +++ b/crates/nvisy-pattern/src/validators/luhn.rs @@ -2,7 +2,7 @@ //! //! Implements the [Luhn algorithm](https://en.wikipedia.org/wiki/Luhn_algorithm) //! used to validate credit/debit card numbers and other identification -//! numbers. Only digits, spaces, and dashes are accepted as input: any +//! numbers. Only digits, spaces, and dashes are accepted as input: any //! other character causes the check to fail. 
/// Return `true` if `num` passes the Luhn checksum. diff --git a/crates/nvisy-pattern/src/validators/mod.rs b/crates/nvisy-pattern/src/validators/mod.rs index cbac8a0..4c1e476 100644 --- a/crates/nvisy-pattern/src/validators/mod.rs +++ b/crates/nvisy-pattern/src/validators/mod.rs @@ -1,7 +1,7 @@ //! Post-match validators for detected entity values. //! //! Patterns can reference a validator by name (e.g. `"validator": "luhn"`) -//! to reduce false positives. At detection time the name is resolved to a +//! to reduce false positives. At detection time the name is resolved to a //! [`ValidatorFn`] via [`ValidatorResolver`]. mod iban; @@ -14,18 +14,15 @@ pub use self::iban::validate_iban; pub use self::luhn::luhn_check; pub use self::ssn::validate_ssn; -/// Signature for a validation function: takes the matched text and returns -/// `true` if the value is valid. +/// Validation function signature: takes matched text, returns `true` if +/// the value is valid. pub type ValidatorFn = fn(&str) -> bool; /// Maps validator names to [`ValidatorFn`]s. /// -/// Created with the built-in validators via [`builtins`] (or -/// [`Default`]), then optionally extended with [`register`] for -/// custom validators. -/// -/// [`builtins`]: Self::builtins -/// [`register`]: Self::register +/// Created with the built-in validators via [`builtins`](Self::builtins) +/// (or [`Default`]), then optionally extended with +/// [`register`](Self::register) for custom validators. 
#[derive(Debug, Clone)] pub struct ValidatorResolver { table: HashMap<&'static str, ValidatorFn>, diff --git a/crates/nvisy-server/src/handler/error/from_core.rs b/crates/nvisy-server/src/handler/error/from_core.rs index 68644d3..1bacfef 100644 --- a/crates/nvisy-server/src/handler/error/from_core.rs +++ b/crates/nvisy-server/src/handler/error/from_core.rs @@ -19,7 +19,7 @@ impl From for Error<'static> { }; let mut error = Self::new(kind).with_message(err.message); - if let Some(component) = err.source_component { + if let Some(component) = err.component { error = error.with_context(component); } error From fa36fb3ab4ed5fdf68dbe4f1e445627da7bc2d2f Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Thu, 12 Mar 2026 23:54:02 +0100 Subject: [PATCH 08/10] refactor(registry): add observability, split modules, and improve API naming Add tracing instrumentation to all public methods, .with_component() on all errors, #![forbid(unsafe_code)] and docsrs attributes. Split flat store/ module into handler/ (ContentHandle, ContextHandle) and registry/ (Registry). Deduplicate composite key construction and prefix-scan helpers. Rename parameters to use _id suffix (actor_id, content_id, context_id) and document all types, fields, and methods. 
Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 1 + crates/nvisy-registry/Cargo.toml | 3 + crates/nvisy-registry/src/handler/content.rs | 154 ++++ crates/nvisy-registry/src/handler/context.rs | 104 +++ .../src/{store => handler}/mod.rs | 4 +- crates/nvisy-registry/src/lib.rs | 21 +- crates/nvisy-registry/src/registry/mod.rs | 867 ++++++++++++++++++ crates/nvisy-registry/src/store/content.rs | 111 --- crates/nvisy-registry/src/store/context.rs | 77 -- crates/nvisy-registry/src/store/registry.rs | 653 ------------- 10 files changed, 1138 insertions(+), 857 deletions(-) create mode 100644 crates/nvisy-registry/src/handler/content.rs create mode 100644 crates/nvisy-registry/src/handler/context.rs rename crates/nvisy-registry/src/{store => handler}/mod.rs (63%) create mode 100644 crates/nvisy-registry/src/registry/mod.rs delete mode 100644 crates/nvisy-registry/src/store/content.rs delete mode 100644 crates/nvisy-registry/src/store/context.rs delete mode 100644 crates/nvisy-registry/src/store/registry.rs diff --git a/Cargo.lock b/Cargo.lock index 37cd3b6..4394e88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3075,6 +3075,7 @@ dependencies = [ "serde_json", "tempfile", "tokio", + "tracing", "uuid", ] diff --git a/crates/nvisy-registry/Cargo.toml b/crates/nvisy-registry/Cargo.toml index bb383c1..399ca45 100644 --- a/crates/nvisy-registry/Cargo.toml +++ b/crates/nvisy-registry/Cargo.toml @@ -33,6 +33,9 @@ fjall = { workspace = true, features = [] } # Async runtime and parallelism tokio = { workspace = true, features = ["sync", "rt"] } +# Observability +tracing = { workspace = true, features = [] } + # (De)serialization serde = { workspace = true, features = [] } serde_json = { workspace = true, features = [] } diff --git a/crates/nvisy-registry/src/handler/content.rs b/crates/nvisy-registry/src/handler/content.rs new file mode 100644 index 0000000..55e29dd --- /dev/null +++ b/crates/nvisy-registry/src/handler/content.rs @@ -0,0 +1,154 @@ +//! 
[`ContentHandle`]: async handle to stored content data and metadata. + +use std::fmt; + +use bytes::Bytes; +use fjall::Keyspace; +use nvisy_core::content::{ContentData, ContentMetadata, ContentSource}; +use nvisy_core::{Error, ErrorKind, Result}; +use uuid::Uuid; + +use crate::registry::composite_key; + +const COMPONENT: &str = "nvisy-registry::content"; + +/// Lightweight handle to a content entry stored in the registry. +/// +/// Holds references to the fjall keyspaces so it can read content data +/// and metadata on demand. Cloning is cheap: fjall handles are +/// internally `Arc`-wrapped. +#[derive(Clone)] +pub struct ContentHandle { + /// Actor identity that owns this content entry. + actor_id: Uuid, + /// Source identifier for the stored content. + content_source: ContentSource, + /// Keyspace storing raw content bytes. + content_ks: Keyspace, + /// Keyspace storing serialized content metadata. + content_meta_ks: Keyspace, +} + +impl fmt::Debug for ContentHandle { + /// Formats the handle for debugging, omitting keyspace internals. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ContentHandle") + .field("actor_id", &self.actor_id) + .field("content_source", &self.content_source) + .finish_non_exhaustive() + } +} + +impl ContentHandle { + /// Creates a new handle from pre-resolved keyspaces. + /// + /// This is `pub(crate)` because only [`Registry`](crate::Registry) + /// should construct handles after verifying the entry exists. + pub(crate) fn new( + actor_id: Uuid, + content_source: ContentSource, + content_ks: Keyspace, + content_meta_ks: Keyspace, + ) -> Self { + Self { + actor_id, + content_source, + content_ks, + content_meta_ks, + } + } + + /// Returns the content source identifier. + #[must_use] + pub fn content_source(&self) -> ContentSource { + self.content_source + } + + /// Returns the actor ID that owns this content. 
+ #[must_use] + pub fn actor_id(&self) -> Uuid { + self.actor_id + } + + /// Reads the content bytes from the store. + /// + /// The read is dispatched to a blocking thread via + /// [`spawn_blocking`](tokio::task::spawn_blocking) to avoid + /// blocking the async runtime on fjall I/O. + #[tracing::instrument( + target = COMPONENT, + name = "content.read_data", + skip(self), + fields(actor_id = %self.actor_id, source_id = %self.content_source.as_uuid()), + )] + pub async fn content_data(&self) -> Result { + let key = composite_key(self.actor_id, self.content_source.as_uuid()); + let source = self.content_source; + let ks = self.content_ks.clone(); + + tokio::task::spawn_blocking(move || -> Result { + let value = ks.get(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to read content data") + .with_component(COMPONENT) + .with_source(err) + })?; + + let guard = value.ok_or_else(|| { + Error::new( + ErrorKind::NotFound, + format!("content data not found: {}", source.as_uuid()), + ) + .with_component(COMPONENT) + })?; + + Ok(ContentData::new(source, Bytes::copy_from_slice(&guard))) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })? + } + + /// Reads the content metadata from the store. + /// + /// Returns [`ContentMetadata::default()`] when the metadata key + /// exists but has no value (e.g. content registered without metadata). 
+ #[tracing::instrument( + target = COMPONENT, + name = "content.read_metadata", + skip(self), + fields(actor_id = %self.actor_id, source_id = %self.content_source.as_uuid()), + )] + pub async fn metadata(&self) -> Result { + let key = composite_key(self.actor_id, self.content_source.as_uuid()); + let ks = self.content_meta_ks.clone(); + + tokio::task::spawn_blocking(move || -> Result { + let value = ks.get(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to read content metadata") + .with_component(COMPONENT) + .with_source(err) + })?; + + match value { + Some(guard) => serde_json::from_slice(&guard).map_err(|err| { + Error::new( + ErrorKind::Serialization, + "failed to deserialize content metadata", + ) + .with_component(COMPONENT) + .with_source(err) + }), + None => Ok(ContentMetadata::default()), + } + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })? + } +} diff --git a/crates/nvisy-registry/src/handler/context.rs b/crates/nvisy-registry/src/handler/context.rs new file mode 100644 index 0000000..77cc3c9 --- /dev/null +++ b/crates/nvisy-registry/src/handler/context.rs @@ -0,0 +1,104 @@ +//! [`ContextHandle`]: async handle to a stored detection context. + +use std::fmt; + +use fjall::Keyspace; +use nvisy_core::content::ContentSource; +use nvisy_core::{Error, ErrorKind, Result}; +use nvisy_ontology::context::Context; +use uuid::Uuid; + +use crate::registry::composite_key; + +const COMPONENT: &str = "nvisy-registry::context"; + +/// Lightweight handle to a context entry stored in the registry. +/// +/// Holds a reference to the contexts keyspace so it can deserialize the +/// stored JSON on demand. Cloning is cheap: fjall handles are +/// internally `Arc`-wrapped. +#[derive(Clone)] +pub struct ContextHandle { + /// Actor identity that owns this context entry. + actor_id: Uuid, + /// Content source this context is associated with. 
+ source: ContentSource, + /// Keyspace storing serialized context JSON. + contexts_ks: Keyspace, +} + +impl fmt::Debug for ContextHandle { + /// Formats the handle for debugging, omitting keyspace internals. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ContextHandle") + .field("actor_id", &self.actor_id) + .field("source", &self.source) + .finish_non_exhaustive() + } +} + +impl ContextHandle { + /// Creates a new handle from a pre-resolved keyspace. + /// + /// This is `pub(crate)` because only [`Registry`](crate::Registry) + /// should construct handles after verifying the entry exists. + pub(crate) fn new(actor_id: Uuid, source: ContentSource, contexts_ks: Keyspace) -> Self { + Self { + actor_id, + source, + contexts_ks, + } + } + + /// Returns the content source identifier. + #[must_use] + pub fn source(&self) -> ContentSource { + self.source + } + + /// Returns the actor ID that owns this context. + #[must_use] + pub fn actor_id(&self) -> Uuid { + self.actor_id + } + + /// Reads and deserializes the context from the store. + /// + /// The read is dispatched to a blocking thread via + /// [`spawn_blocking`](tokio::task::spawn_blocking) to avoid + /// blocking the async runtime on fjall I/O. 
+ #[tracing::instrument( + target = COMPONENT, + name = "context.read", + skip(self), + fields(actor_id = %self.actor_id, source_id = %self.source.as_uuid()), + )] + pub async fn context(&self) -> Result { + let key = composite_key(self.actor_id, self.source.as_uuid()); + let ks = self.contexts_ks.clone(); + + tokio::task::spawn_blocking(move || -> Result { + let value = ks.get(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to read context") + .with_component(COMPONENT) + .with_source(err) + })?; + + let guard = value.ok_or_else(|| { + Error::new(ErrorKind::NotFound, "context data not found").with_component(COMPONENT) + })?; + + serde_json::from_slice(&guard).map_err(|err| { + Error::new(ErrorKind::Serialization, "failed to deserialize context") + .with_component(COMPONENT) + .with_source(err) + }) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })? + } +} diff --git a/crates/nvisy-registry/src/store/mod.rs b/crates/nvisy-registry/src/handler/mod.rs similarity index 63% rename from crates/nvisy-registry/src/store/mod.rs rename to crates/nvisy-registry/src/handler/mod.rs index b53d183..b8e68fb 100644 --- a/crates/nvisy-registry/src/store/mod.rs +++ b/crates/nvisy-registry/src/handler/mod.rs @@ -1,7 +1,7 @@ +//! Async handles for reading stored content and contexts. + mod content; mod context; -mod registry; pub use self::content::ContentHandle; pub use self::context::ContextHandle; -pub use self::registry::Registry; diff --git a/crates/nvisy-registry/src/lib.rs b/crates/nvisy-registry/src/lib.rs index 6e79dba..a45658c 100644 --- a/crates/nvisy-registry/src/lib.rs +++ b/crates/nvisy-registry/src/lib.rs @@ -1,19 +1,12 @@ -//! Actor-scoped content and context storage backed by fjall. -//! -//! This crate provides [`Registry`], a unified store that manages both -//! content files and detection contexts. Every resource is scoped by a -//! 
`Uuid` actor identity, so listing and reading are inherently -//! actor-isolated at the database level via composite keys. -//! -//! # Core Types -//! -//! - [`Registry`]: Shared, clonable handle to the fjall database -//! - [`ContentHandle`]: Lightweight async handle to stored content -//! - [`ContextHandle`]: Lightweight async handle to a stored context +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] -mod store; +mod handler; +mod registry; #[doc(hidden)] pub mod prelude; -pub use self::store::{ContentHandle, ContextHandle, Registry}; +pub use self::handler::{ContentHandle, ContextHandle}; +pub use self::registry::Registry; diff --git a/crates/nvisy-registry/src/registry/mod.rs b/crates/nvisy-registry/src/registry/mod.rs new file mode 100644 index 0000000..31e1b43 --- /dev/null +++ b/crates/nvisy-registry/src/registry/mod.rs @@ -0,0 +1,867 @@ +//! [`Registry`]: actor-scoped content and context store backed by fjall. + +use std::path::{Path, PathBuf}; + +use fjall::{Database, Keyspace, KeyspaceCreateOptions, KvSeparationOptions}; +use nvisy_core::content::{Content, ContentSource}; +use nvisy_core::{Error, ErrorKind, Result}; +use nvisy_ontology::context::Context; +use uuid::Uuid; + +use crate::handler::{ContentHandle, ContextHandle}; + +const TARGET: &str = "nvisy_registry"; +const COMPONENT: &str = "nvisy-registry"; + +/// Builds a 32-byte composite key: `[actor_id: 16][resource_id: 16]`. +/// +/// Used by both [`ContentHandle`] and [`ContextHandle`] to scope every +/// read/write to a specific actor. +pub(crate) fn composite_key(actor_id: Uuid, resource_id: Uuid) -> [u8; 32] { + let mut key = [0u8; 32]; + key[..16].copy_from_slice(actor_id.as_bytes()); + key[16..].copy_from_slice(resource_id.as_bytes()); + key +} + +/// Actor-scoped content and context store backed by fjall. +/// +/// Stores content data, content metadata, and contexts in three keyspaces. 
+/// Every key is a 32-byte composite of `[actor_id][resource_id]`, so all +/// operations are inherently scoped to a single actor. +/// +/// All handles are internally `Arc`-wrapped, making `Registry` cheap to +/// clone and safe to share across threads. +#[derive(Clone)] +pub struct Registry { + /// Filesystem path where the fjall database is stored. + base_dir: PathBuf, + /// Underlying fjall database handle. + db: Database, + /// Keyspace for raw content bytes (blob-separated). + content_ks: Keyspace, + /// Keyspace for serialized content metadata. + content_meta_ks: Keyspace, + /// Keyspace for serialized detection contexts. + contexts_ks: Keyspace, +} + +impl std::fmt::Debug for Registry { + /// Formats the registry for debugging, showing only the base directory. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Registry") + .field("base_dir", &self.base_dir) + .finish_non_exhaustive() + } +} + +impl Registry { + /// Opens (or creates) the fjall database at `path`. + /// + /// Three keyspaces are created: + /// - `"content"`: blob separation for efficient large-value storage + /// - `"content_meta"`: default configuration + /// - `"contexts"`: default configuration + /// + /// # Errors + /// + /// Returns an error if the database or keyspaces cannot be opened. 
+ #[tracing::instrument(target = TARGET, name = "registry.open", fields(path = %path.as_ref().display()))] + pub fn open(path: impl AsRef) -> Result { + let base_dir = path.as_ref().to_path_buf(); + + let db = Database::builder(&base_dir).open().map_err(|err| { + Error::new( + ErrorKind::Internal, + format!("failed to open database: {}", base_dir.display()), + ) + .with_component(COMPONENT) + .with_source(err) + })?; + + let content_ks = db + .keyspace("content", || { + KeyspaceCreateOptions::default() + .with_kv_separation(Some(KvSeparationOptions::default())) + }) + .map_err(|err| { + Error::new(ErrorKind::Internal, "failed to open content keyspace") + .with_component(COMPONENT) + .with_source(err) + })?; + + let content_meta_ks = db + .keyspace("content_meta", KeyspaceCreateOptions::default) + .map_err(|err| { + Error::new(ErrorKind::Internal, "failed to open content_meta keyspace") + .with_component(COMPONENT) + .with_source(err) + })?; + + let contexts_ks = db + .keyspace("contexts", KeyspaceCreateOptions::default) + .map_err(|err| { + Error::new(ErrorKind::Internal, "failed to open contexts keyspace") + .with_component(COMPONENT) + .with_source(err) + })?; + + tracing::debug!(target: TARGET, "registry opened"); + + Ok(Self { + base_dir, + db, + content_ks, + content_meta_ks, + contexts_ks, + }) + } + + // -- Content operations -------------------------------------------------- + + /// Registers content, writing its bytes and metadata to the store. + /// + /// Returns a [`ContentHandle`] for subsequent reads. + /// + /// # Errors + /// + /// Returns an error if serialization or the underlying write fails. 
+ #[tracing::instrument( + target = TARGET, + name = "registry.register_content", + skip(self, content), + fields(actor_id = %actor_id), + )] + pub async fn register_content( + &self, + actor_id: Uuid, + content: Content, + ) -> Result { + let content_source = content.content_source(); + let key = composite_key(actor_id, content_source.as_uuid()); + let data = content.as_bytes().to_vec(); + + let (_, content_metadata) = content.into_parts(); + let meta_bytes = + serde_json::to_vec(&content_metadata.unwrap_or_default()).map_err(|err| { + Error::new( + ErrorKind::Serialization, + "failed to serialize content metadata", + ) + .with_component(COMPONENT) + .with_source(err) + })?; + + let content_ks = self.content_ks.clone(); + let meta_ks = self.content_meta_ks.clone(); + let db = self.db.clone(); + + tokio::task::spawn_blocking(move || -> Result<()> { + content_ks.insert(key, &data).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to write content data") + .with_component(COMPONENT) + .with_source(err) + })?; + meta_ks.insert(key, &meta_bytes).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to write content metadata") + .with_component(COMPONENT) + .with_source(err) + })?; + db.persist(fjall::PersistMode::SyncAll).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to persist database") + .with_component(COMPONENT) + .with_source(err) + })?; + Ok(()) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })??; + + tracing::trace!( + target: TARGET, + source_id = %content_source.as_uuid(), + "content registered", + ); + + Ok(ContentHandle::new( + actor_id, + content_source, + self.content_ks.clone(), + self.content_meta_ks.clone(), + )) + } + + /// Looks up previously registered content by actor and content ID. + /// + /// # Errors + /// + /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. 
+ #[tracing::instrument( + target = TARGET, + name = "registry.read_content", + skip(self), + fields(actor_id = %actor_id, content_id = %content_id), + )] + pub async fn read_content(&self, actor_id: Uuid, content_id: Uuid) -> Result { + let key = composite_key(actor_id, content_id); + let ks = self.content_ks.clone(); + + let exists = tokio::task::spawn_blocking(move || -> Result { + ks.contains_key(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to check content key") + .with_component(COMPONENT) + .with_source(err) + }) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })??; + + if !exists { + return Err(Error::new( + ErrorKind::NotFound, + format!("content not found: actor_id={actor_id}, content_id={content_id}"), + ) + .with_component(COMPONENT)); + } + + let source = ContentSource::from(content_id); + Ok(ContentHandle::new( + actor_id, + source, + self.content_ks.clone(), + self.content_meta_ks.clone(), + )) + } + + /// Removes a single content entry (data + metadata) by actor and content ID. + /// + /// # Errors + /// + /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. 
+ #[tracing::instrument( + target = TARGET, + name = "registry.unregister_content", + skip(self), + fields(actor_id = %actor_id, content_id = %content_id), + )] + pub async fn unregister_content(&self, actor_id: Uuid, content_id: Uuid) -> Result<()> { + let key = composite_key(actor_id, content_id); + let content_ks = self.content_ks.clone(); + let meta_ks = self.content_meta_ks.clone(); + let db = self.db.clone(); + + tokio::task::spawn_blocking(move || -> Result<()> { + let exists = content_ks.contains_key(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to check content key") + .with_component(COMPONENT) + .with_source(err) + })?; + + if !exists { + return Err(Error::new( + ErrorKind::NotFound, + format!("content not found: actor_id={actor_id}, content_id={content_id}"), + ) + .with_component(COMPONENT)); + } + + content_ks.remove(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to remove content data") + .with_component(COMPONENT) + .with_source(err) + })?; + meta_ks.remove(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to remove content metadata") + .with_component(COMPONENT) + .with_source(err) + })?; + db.persist(fjall::PersistMode::SyncAll).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to persist database") + .with_component(COMPONENT) + .with_source(err) + })?; + Ok(()) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })? + } + + /// Removes all content entries (data + metadata) for an actor. + /// + /// Returns the number of entries removed. 
+ #[tracing::instrument( + target = TARGET, + name = "registry.unregister_all_content", + skip(self), + fields(actor_id = %actor_id, removed), + )] + pub async fn unregister_all_content(&self, actor_id: Uuid) -> Result { + let prefix = actor_id.as_bytes().to_vec(); + let content_ks = self.content_ks.clone(); + let meta_ks = self.content_meta_ks.clone(); + let db = self.db.clone(); + + let count = tokio::task::spawn_blocking(move || -> Result { + let keys = collect_prefix_keys(&content_ks, &prefix)?; + let count = keys.len(); + + for key in &keys { + content_ks.remove(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to remove content data") + .with_component(COMPONENT) + .with_source(err) + })?; + meta_ks.remove(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to remove content metadata") + .with_component(COMPONENT) + .with_source(err) + })?; + } + + if count > 0 { + db.persist(fjall::PersistMode::SyncAll).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to persist database") + .with_component(COMPONENT) + .with_source(err) + })?; + } + + Ok(count) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })??; + + tracing::Span::current().record("removed", count); + Ok(count) + } + + /// Lists all content IDs for an actor, sorted in ascending order. + #[tracing::instrument( + target = TARGET, + name = "registry.list_content", + skip(self), + fields(actor_id = %actor_id), + )] + pub async fn list_content(&self, actor_id: Uuid) -> Result> { + let prefix = actor_id.as_bytes().to_vec(); + let ks = self.content_ks.clone(); + + tokio::task::spawn_blocking(move || extract_resource_ids(&ks, &prefix)) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })? 
+ } + + // -- Context operations -------------------------------------------------- + + /// Registers a context, serializing it as JSON. + /// + /// Returns a [`ContextHandle`] for subsequent reads. + /// + /// # Errors + /// + /// Returns an error if serialization or the underlying write fails. + #[tracing::instrument( + target = TARGET, + name = "registry.register_context", + skip(self, context), + fields(actor_id = %actor_id), + )] + pub async fn register_context( + &self, + actor_id: Uuid, + context: Context, + ) -> Result { + let source = context.source; + let key = composite_key(actor_id, source.as_uuid()); + + let json_bytes = serde_json::to_vec(&context).map_err(|err| { + Error::new(ErrorKind::Serialization, "failed to serialize context") + .with_component(COMPONENT) + .with_source(err) + })?; + + let ks = self.contexts_ks.clone(); + let db = self.db.clone(); + + tokio::task::spawn_blocking(move || -> Result<()> { + ks.insert(key, &json_bytes).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to write context") + .with_component(COMPONENT) + .with_source(err) + })?; + db.persist(fjall::PersistMode::SyncAll).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to persist database") + .with_component(COMPONENT) + .with_source(err) + })?; + Ok(()) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })??; + + tracing::trace!( + target: TARGET, + source_id = %source.as_uuid(), + "context registered", + ); + + Ok(ContextHandle::new( + actor_id, + source, + self.contexts_ks.clone(), + )) + } + + /// Looks up a previously registered context by actor and context ID. + /// + /// # Errors + /// + /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. 
+ #[tracing::instrument( + target = TARGET, + name = "registry.read_context", + skip(self), + fields(actor_id = %actor_id, context_id = %context_id), + )] + pub async fn read_context(&self, actor_id: Uuid, context_id: Uuid) -> Result { + let key = composite_key(actor_id, context_id); + let ks = self.contexts_ks.clone(); + + let exists = tokio::task::spawn_blocking(move || -> Result { + ks.contains_key(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to check context key") + .with_component(COMPONENT) + .with_source(err) + }) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })??; + + if !exists { + return Err(Error::new( + ErrorKind::NotFound, + format!("context not found: actor_id={actor_id}, context_id={context_id}"), + ) + .with_component(COMPONENT)); + } + + let source = ContentSource::from(context_id); + Ok(ContextHandle::new( + actor_id, + source, + self.contexts_ks.clone(), + )) + } + + /// Removes a single context entry by actor and context ID. + /// + /// # Errors + /// + /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. 
+ #[tracing::instrument( + target = TARGET, + name = "registry.unregister_context", + skip(self), + fields(actor_id = %actor_id, context_id = %context_id), + )] + pub async fn unregister_context(&self, actor_id: Uuid, context_id: Uuid) -> Result<()> { + let key = composite_key(actor_id, context_id); + let ks = self.contexts_ks.clone(); + let db = self.db.clone(); + + tokio::task::spawn_blocking(move || -> Result<()> { + let exists = ks.contains_key(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to check context key") + .with_component(COMPONENT) + .with_source(err) + })?; + + if !exists { + return Err(Error::new( + ErrorKind::NotFound, + format!("context not found: actor_id={actor_id}, context_id={context_id}"), + ) + .with_component(COMPONENT)); + } + + ks.remove(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to remove context") + .with_component(COMPONENT) + .with_source(err) + })?; + db.persist(fjall::PersistMode::SyncAll).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to persist database") + .with_component(COMPONENT) + .with_source(err) + })?; + Ok(()) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })? + } + + /// Removes all context entries for an actor. + /// + /// Returns the number of entries removed. 
+ #[tracing::instrument( + target = TARGET, + name = "registry.unregister_all_contexts", + skip(self), + fields(actor_id = %actor_id, removed), + )] + pub async fn unregister_all_contexts(&self, actor_id: Uuid) -> Result { + let prefix = actor_id.as_bytes().to_vec(); + let ks = self.contexts_ks.clone(); + let db = self.db.clone(); + + let count = tokio::task::spawn_blocking(move || -> Result { + let keys = collect_prefix_keys(&ks, &prefix)?; + let count = keys.len(); + + for key in &keys { + ks.remove(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to remove context") + .with_component(COMPONENT) + .with_source(err) + })?; + } + + if count > 0 { + db.persist(fjall::PersistMode::SyncAll).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to persist database") + .with_component(COMPONENT) + .with_source(err) + })?; + } + + Ok(count) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })??; + + tracing::Span::current().record("removed", count); + Ok(count) + } + + /// Lists all context IDs for an actor, sorted in ascending order. + #[tracing::instrument( + target = TARGET, + name = "registry.list_contexts", + skip(self), + fields(actor_id = %actor_id), + )] + pub async fn list_contexts(&self, actor_id: Uuid) -> Result> { + let prefix = actor_id.as_bytes().to_vec(); + let ks = self.contexts_ks.clone(); + + tokio::task::spawn_blocking(move || extract_resource_ids(&ks, &prefix)) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })? + } + + /// Returns the base directory path where the database is stored. + #[must_use] + pub fn base_dir(&self) -> &Path { + &self.base_dir + } +} + +/// Collects all raw keys from a keyspace that share the given prefix. 
+fn collect_prefix_keys(ks: &Keyspace, prefix: &[u8]) -> Result>> { + ks.prefix(prefix) + .map(|guard| { + let key = guard.key().map_err(|err| { + Error::new(ErrorKind::Internal, "failed to iterate keyspace") + .with_component(COMPONENT) + .with_source(err) + })?; + Ok(key.to_vec()) + }) + .collect() +} + +/// Extracts sorted resource UUIDs from the trailing 16 bytes of each +/// 32-byte composite key that shares the given prefix. +fn extract_resource_ids(ks: &Keyspace, prefix: &[u8]) -> Result> { + let mut ids = Vec::new(); + for guard in ks.prefix(prefix) { + let key = guard.key().map_err(|err| { + Error::new(ErrorKind::Internal, "failed to iterate keyspace") + .with_component(COMPONENT) + .with_source(err) + })?; + if key.len() == 32 + && let Ok(bytes) = <[u8; 16]>::try_from(&key[16..]) + { + ids.push(Uuid::from_bytes(bytes)); + } + } + ids.sort(); + Ok(ids) +} + +#[cfg(test)] +mod tests { + use nvisy_core::content::{Content, ContentData}; + use nvisy_ontology::context::Context; + + use super::*; + + /// Opens a temporary registry backed by a fresh [`tempfile::TempDir`]. 
+ fn open_temp_registry() -> (tempfile::TempDir, Registry) { + let temp = tempfile::TempDir::new().unwrap(); + let registry = Registry::open(temp.path().join("data")).unwrap(); + (temp, registry) + } + + #[tokio::test] + async fn register_and_read_content() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + let content = Content::new(ContentData::from("Hello, world!")); + + let handle = registry.register_content(actor_id, content).await.unwrap(); + let data = handle.content_data().await.unwrap(); + assert_eq!(data.as_str().unwrap(), "Hello, world!"); + } + + #[tokio::test] + async fn content_scoped_by_actor() { + let (_temp, registry) = open_temp_registry(); + let actor_a = Uuid::now_v7(); + let actor_b = Uuid::now_v7(); + + let content = Content::new(ContentData::from("actor A only")); + let handle = registry.register_content(actor_a, content).await.unwrap(); + let content_id = handle.content_source().as_uuid(); + + // Actor B cannot see actor A's content. + let err = registry + .read_content(actor_b, content_id) + .await + .unwrap_err(); + assert_eq!(err.kind, ErrorKind::NotFound); + + // Actor A can. 
+ registry.read_content(actor_a, content_id).await.unwrap(); + } + + #[tokio::test] + async fn list_content_per_actor() { + let (_temp, registry) = open_temp_registry(); + let actor_a = Uuid::now_v7(); + let actor_b = Uuid::now_v7(); + + registry + .register_content(actor_a, Content::new(ContentData::from("a1"))) + .await + .unwrap(); + registry + .register_content(actor_a, Content::new(ContentData::from("a2"))) + .await + .unwrap(); + registry + .register_content(actor_b, Content::new(ContentData::from("b1"))) + .await + .unwrap(); + + assert_eq!(registry.list_content(actor_a).await.unwrap().len(), 2); + assert_eq!(registry.list_content(actor_b).await.unwrap().len(), 1); + } + + #[tokio::test] + async fn unregister_content() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + let content = Content::new(ContentData::from("delete me")); + let content_id = content.content_source().as_uuid(); + registry.register_content(actor_id, content).await.unwrap(); + + registry + .unregister_content(actor_id, content_id) + .await + .unwrap(); + + let err = registry + .read_content(actor_id, content_id) + .await + .unwrap_err(); + assert_eq!(err.kind, ErrorKind::NotFound); + } + + #[tokio::test] + async fn unregister_all_content() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + + registry + .register_content(actor_id, Content::new(ContentData::from("first"))) + .await + .unwrap(); + registry + .register_content(actor_id, Content::new(ContentData::from("second"))) + .await + .unwrap(); + + let deleted = registry.unregister_all_content(actor_id).await.unwrap(); + assert_eq!(deleted, 2); + assert!(registry.list_content(actor_id).await.unwrap().is_empty()); + } + + #[tokio::test] + async fn register_and_read_context() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + let ctx = Context::new("test-context", vec![]); + + let handle = registry + .register_context(actor_id, 
ctx.clone()) + .await + .unwrap(); + let read_ctx = handle.context().await.unwrap(); + assert_eq!(read_ctx.name, "test-context"); + } + + #[tokio::test] + async fn context_scoped_by_actor() { + let (_temp, registry) = open_temp_registry(); + let actor_a = Uuid::now_v7(); + let actor_b = Uuid::now_v7(); + + let ctx = Context::new("private", vec![]); + let handle = registry.register_context(actor_a, ctx).await.unwrap(); + let context_id = handle.source().as_uuid(); + + let err = registry + .read_context(actor_b, context_id) + .await + .unwrap_err(); + assert_eq!(err.kind, ErrorKind::NotFound); + + registry.read_context(actor_a, context_id).await.unwrap(); + } + + #[tokio::test] + async fn list_contexts_per_actor() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + + registry + .register_context(actor_id, Context::new("ctx-1", vec![])) + .await + .unwrap(); + registry + .register_context(actor_id, Context::new("ctx-2", vec![])) + .await + .unwrap(); + + assert_eq!(registry.list_contexts(actor_id).await.unwrap().len(), 2); + } + + #[tokio::test] + async fn unregister_context() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + let ctx = Context::new("remove-me", vec![]); + let context_id = ctx.source.as_uuid(); + + registry.register_context(actor_id, ctx).await.unwrap(); + registry + .unregister_context(actor_id, context_id) + .await + .unwrap(); + + let err = registry + .read_context(actor_id, context_id) + .await + .unwrap_err(); + assert_eq!(err.kind, ErrorKind::NotFound); + } + + #[tokio::test] + async fn unregister_all_contexts() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + + registry + .register_context(actor_id, Context::new("c1", vec![])) + .await + .unwrap(); + registry + .register_context(actor_id, Context::new("c2", vec![])) + .await + .unwrap(); + + let deleted = registry.unregister_all_contexts(actor_id).await.unwrap(); + assert_eq!(deleted, 2); + 
assert!(registry.list_contexts(actor_id).await.unwrap().is_empty()); + } + + #[tokio::test] + async fn data_persists_across_reopen() { + let temp = tempfile::TempDir::new().unwrap(); + let path = temp.path().join("data"); + let actor_id = Uuid::now_v7(); + + let content = Content::new(ContentData::from("persistent")); + let content_id = content.content_source().as_uuid(); + + { + let registry = Registry::open(&path).unwrap(); + registry.register_content(actor_id, content).await.unwrap(); + } + + let registry = Registry::open(&path).unwrap(); + let handle = registry.read_content(actor_id, content_id).await.unwrap(); + let data = handle.content_data().await.unwrap(); + assert_eq!(data.as_str().unwrap(), "persistent"); + } + + #[tokio::test] + async fn base_dir() { + let temp = tempfile::TempDir::new().unwrap(); + let base = temp.path().join("data"); + let registry = Registry::open(&base).unwrap(); + assert_eq!(registry.base_dir(), base); + } +} diff --git a/crates/nvisy-registry/src/store/content.rs b/crates/nvisy-registry/src/store/content.rs deleted file mode 100644 index 9d50149..0000000 --- a/crates/nvisy-registry/src/store/content.rs +++ /dev/null @@ -1,111 +0,0 @@ -use std::fmt; - -use bytes::Bytes; -use fjall::Keyspace; -use nvisy_core::content::{ContentData, ContentMetadata, ContentSource}; -use nvisy_core::{Error, ErrorKind, Result}; -use uuid::Uuid; - -/// Lightweight handle to a content entry stored in the registry. -/// -/// Holds references to the fjall keyspaces so it can read content data -/// and metadata on demand. Cloning is cheap because fjall handles are -/// internally `Arc`-wrapped. 
-#[derive(Clone)] -pub struct ContentHandle { - actor: Uuid, - content_source: ContentSource, - content: Keyspace, - content_meta: Keyspace, -} - -impl fmt::Debug for ContentHandle { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("ContentHandle") - .field("actor", &self.actor) - .field("content_source", &self.content_source) - .finish_non_exhaustive() - } -} - -impl ContentHandle { - pub(crate) fn new( - actor: Uuid, - content_source: ContentSource, - content: Keyspace, - content_meta: Keyspace, - ) -> Self { - Self { - actor, - content_source, - content, - content_meta, - } - } - - /// Returns the content source identifier. - pub fn content_source(&self) -> ContentSource { - self.content_source - } - - /// Returns the actor that owns this content. - pub fn actor(&self) -> Uuid { - self.actor - } - - /// Reads the content bytes from the store. - pub async fn content_data(&self) -> Result { - let key = self.composite_key(); - let source = self.content_source; - let content_ks = self.content.clone(); - - tokio::task::spawn_blocking(move || -> Result { - let value = content_ks.get(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to read content data").with_source(err) - })?; - - let guard = value.ok_or_else(|| { - Error::new( - ErrorKind::NotFound, - format!("Content data not found (id: {})", source.as_uuid()), - ) - })?; - - Ok(ContentData::new(source, Bytes::copy_from_slice(&guard))) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - /// Reads the content metadata from the store. 
- pub async fn metadata(&self) -> Result { - let key = self.composite_key(); - let meta_ks = self.content_meta.clone(); - - tokio::task::spawn_blocking(move || -> Result { - let value = meta_ks.get(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to read content metadata").with_source(err) - })?; - - match value { - Some(guard) => serde_json::from_slice(&guard).map_err(|err| { - Error::new( - ErrorKind::Serialization, - "Failed to deserialize content metadata", - ) - .with_source(err) - }), - None => Ok(ContentMetadata::default()), - } - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - fn composite_key(&self) -> [u8; 32] { - let mut key = [0u8; 32]; - key[..16].copy_from_slice(self.actor.as_bytes()); - key[16..].copy_from_slice(self.content_source.as_uuid().as_bytes()); - key - } -} diff --git a/crates/nvisy-registry/src/store/context.rs b/crates/nvisy-registry/src/store/context.rs deleted file mode 100644 index 6e690ff..0000000 --- a/crates/nvisy-registry/src/store/context.rs +++ /dev/null @@ -1,77 +0,0 @@ -use std::fmt; - -use fjall::Keyspace; -use nvisy_core::content::ContentSource; -use nvisy_core::{Error, ErrorKind, Result}; -use nvisy_ontology::context::Context; -use uuid::Uuid; - -/// Lightweight handle to a context entry stored in the registry. -/// -/// Holds a reference to the contexts keyspace so it can deserialize the -/// stored JSON on demand. Cloning is cheap because fjall handles are -/// internally `Arc`-wrapped. 
-#[derive(Clone)] -pub struct ContextHandle { - actor: Uuid, - source: ContentSource, - contexts: Keyspace, -} - -impl fmt::Debug for ContextHandle { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("ContextHandle") - .field("actor", &self.actor) - .field("source", &self.source) - .finish_non_exhaustive() - } -} - -impl ContextHandle { - pub(crate) fn new(actor: Uuid, source: ContentSource, contexts: Keyspace) -> Self { - Self { - actor, - source, - contexts, - } - } - - /// Returns the content source identifier. - pub fn source(&self) -> ContentSource { - self.source - } - - /// Returns the actor that owns this context. - pub fn actor(&self) -> Uuid { - self.actor - } - - /// Reads and deserializes the context from the store. - pub async fn context(&self) -> Result { - let key = self.composite_key(); - let ctx_ks = self.contexts.clone(); - - tokio::task::spawn_blocking(move || -> Result { - let value = ctx_ks.get(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to read context").with_source(err) - })?; - - let guard = - value.ok_or_else(|| Error::new(ErrorKind::NotFound, "Context data not found"))?; - - serde_json::from_slice(&guard).map_err(|err| { - Error::new(ErrorKind::Serialization, "Failed to deserialize context") - .with_source(err) - }) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? 
- } - - fn composite_key(&self) -> [u8; 32] { - let mut key = [0u8; 32]; - key[..16].copy_from_slice(self.actor.as_bytes()); - key[16..].copy_from_slice(self.source.as_uuid().as_bytes()); - key - } -} diff --git a/crates/nvisy-registry/src/store/registry.rs b/crates/nvisy-registry/src/store/registry.rs deleted file mode 100644 index 53a710b..0000000 --- a/crates/nvisy-registry/src/store/registry.rs +++ /dev/null @@ -1,653 +0,0 @@ -use std::path::{Path, PathBuf}; - -use fjall::{Database, Keyspace, KeyspaceCreateOptions, KvSeparationOptions}; -use nvisy_core::content::{Content, ContentSource}; -use nvisy_core::{Error, ErrorKind, Result}; -use nvisy_ontology::context::Context; -use uuid::Uuid; - -use super::content::ContentHandle; -use super::context::ContextHandle; - -/// Builds a 32-byte composite key: `[actor: 16][resource_id: 16]`. -fn make_key(actor: Uuid, id: Uuid) -> [u8; 32] { - let mut key = [0u8; 32]; - key[..16].copy_from_slice(actor.as_bytes()); - key[16..].copy_from_slice(id.as_bytes()); - key -} - -/// Actor-scoped content and context store backed by fjall. -/// -/// Stores content data, content metadata, and contexts in three keyspaces. -/// Every key is a 32-byte composite of `[actor_id][resource_id]`, so all -/// operations are inherently scoped to a single actor. -/// -/// All handles are internally `Arc`-wrapped, making `Registry` cheap to -/// clone and safe to share across threads. -#[derive(Clone)] -pub struct Registry { - base_dir: PathBuf, - db: Database, - content: Keyspace, - content_meta: Keyspace, - contexts: Keyspace, -} - -impl std::fmt::Debug for Registry { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("Registry") - .field("base_dir", &self.base_dir) - .finish_non_exhaustive() - } -} - -impl Registry { - /// Opens (or creates) the fjall database at `path`. 
- /// - /// Three keyspaces are created: - /// - `"content"` with blob separation for efficient large-value storage - /// - `"content_meta"` with default configuration - /// - `"contexts"` with default configuration - /// - /// # Errors - /// - /// Returns an error if the database or keyspaces cannot be opened. - pub fn open(path: impl Into) -> Result { - let base_dir = path.into(); - - let db = Database::builder(&base_dir).open().map_err(|err| { - Error::new( - ErrorKind::Internal, - format!( - "Failed to open registry database (path: {})", - base_dir.display() - ), - ) - .with_source(err) - })?; - - let content = db - .keyspace("content", || { - KeyspaceCreateOptions::default() - .with_kv_separation(Some(KvSeparationOptions::default())) - }) - .map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to open content keyspace").with_source(err) - })?; - - let content_meta = db - .keyspace("content_meta", KeyspaceCreateOptions::default) - .map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to open content_meta keyspace") - .with_source(err) - })?; - - let contexts = db - .keyspace("contexts", KeyspaceCreateOptions::default) - .map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to open contexts keyspace").with_source(err) - })?; - - Ok(Self { - base_dir, - db, - content, - content_meta, - contexts, - }) - } - - /// Registers content, writing its bytes and metadata to the store. - /// - /// Returns a [`ContentHandle`] for subsequent reads. 
- pub async fn register_content(&self, actor: Uuid, content: Content) -> Result { - let content_source = content.content_source(); - let key = make_key(actor, content_source.as_uuid()); - let data = content.as_bytes().to_vec(); - - let (_, content_metadata) = content.into_parts(); - let meta_bytes = - serde_json::to_vec(&content_metadata.unwrap_or_default()).map_err(|err| { - Error::new( - ErrorKind::Serialization, - "Failed to serialize content metadata", - ) - .with_source(err) - })?; - - let content_ks = self.content.clone(); - let meta_ks = self.content_meta.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result<()> { - content_ks.insert(key, &data).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to write content data").with_source(err) - })?; - meta_ks.insert(key, &meta_bytes).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to write content metadata").with_source(err) - })?; - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - Ok(()) - }) - .await - .map_err(|err| { - Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err) - })??; - - Ok(ContentHandle::new( - actor, - content_source, - self.content.clone(), - self.content_meta.clone(), - )) - } - - /// Looks up previously registered content by actor and content ID. - /// - /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. 
- pub async fn read_content(&self, actor: Uuid, id: Uuid) -> Result { - let key = make_key(actor, id); - let content_ks = self.content.clone(); - - let exists = tokio::task::spawn_blocking(move || -> Result { - content_ks.contains_key(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to check content key").with_source(err) - }) - }) - .await - .map_err(|err| { - Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err) - })??; - - if !exists { - return Err(Error::new( - ErrorKind::NotFound, - format!("Content not found (actor: {actor}, id: {id})"), - )); - } - - let source = ContentSource::from(id); - Ok(ContentHandle::new( - actor, - source, - self.content.clone(), - self.content_meta.clone(), - )) - } - - /// Removes a single content entry by actor and content ID. - /// - /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. - pub async fn unregister_content(&self, actor: Uuid, id: Uuid) -> Result<()> { - let key = make_key(actor, id); - let content_ks = self.content.clone(); - let meta_ks = self.content_meta.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result<()> { - let exists = content_ks.contains_key(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to check content key").with_source(err) - })?; - - if !exists { - return Err(Error::new( - ErrorKind::NotFound, - format!("Content not found (actor: {actor}, id: {id})"), - )); - } - - content_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove content data").with_source(err) - })?; - meta_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove content metadata") - .with_source(err) - })?; - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - Ok(()) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? 
- } - - /// Removes all content entries for an actor. - /// - /// Returns the number of entries removed. - pub async fn unregister_all_content(&self, actor: Uuid) -> Result { - let prefix = actor.as_bytes().to_vec(); - let content_ks = self.content.clone(); - let meta_ks = self.content_meta.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result { - let keys: Vec> = content_ks - .prefix(&prefix) - .map(|guard| { - let key = guard.key().map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to iterate content keyspace") - .with_source(err) - })?; - Ok(key.to_vec()) - }) - .collect::>>()?; - - let count = keys.len(); - - for key in &keys { - content_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove content data") - .with_source(err) - })?; - meta_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove content metadata") - .with_source(err) - })?; - } - - if count > 0 { - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - } - - Ok(count) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - /// Lists all content IDs for an actor. - pub async fn list_content(&self, actor: Uuid) -> Result> { - let prefix = actor.as_bytes().to_vec(); - let content_ks = self.content.clone(); - - tokio::task::spawn_blocking(move || -> Result> { - let mut ids = Vec::new(); - for guard in content_ks.prefix(&prefix) { - let key = guard.key().map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to iterate content keyspace") - .with_source(err) - })?; - if key.len() == 32 - && let Ok(bytes) = <[u8; 16]>::try_from(&key[16..]) - { - ids.push(Uuid::from_bytes(bytes)); - } - } - ids.sort(); - Ok(ids) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? 
- } - - /// Registers a context, serializing it as JSON. - /// - /// Returns a [`ContextHandle`] for subsequent reads. - pub async fn register_context(&self, actor: Uuid, context: Context) -> Result { - let source = context.source; - let key = make_key(actor, source.as_uuid()); - - let json_bytes = serde_json::to_vec(&context).map_err(|err| { - Error::new(ErrorKind::Serialization, "Failed to serialize context").with_source(err) - })?; - - let ctx_ks = self.contexts.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result<()> { - ctx_ks.insert(key, &json_bytes).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to write context").with_source(err) - })?; - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - Ok(()) - }) - .await - .map_err(|err| { - Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err) - })??; - - Ok(ContextHandle::new(actor, source, self.contexts.clone())) - } - - /// Looks up a previously registered context by actor and context ID. - /// - /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. - pub async fn read_context(&self, actor: Uuid, id: Uuid) -> Result { - let key = make_key(actor, id); - let ctx_ks = self.contexts.clone(); - - let exists = tokio::task::spawn_blocking(move || -> Result { - ctx_ks.contains_key(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to check context key").with_source(err) - }) - }) - .await - .map_err(|err| { - Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err) - })??; - - if !exists { - return Err(Error::new( - ErrorKind::NotFound, - format!("Context not found (actor: {actor}, id: {id})"), - )); - } - - let source = ContentSource::from(id); - Ok(ContextHandle::new(actor, source, self.contexts.clone())) - } - - /// Removes a single context entry by actor and context ID. 
- /// - /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. - pub async fn unregister_context(&self, actor: Uuid, id: Uuid) -> Result<()> { - let key = make_key(actor, id); - let ctx_ks = self.contexts.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result<()> { - let exists = ctx_ks.contains_key(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to check context key").with_source(err) - })?; - - if !exists { - return Err(Error::new( - ErrorKind::NotFound, - format!("Context not found (actor: {actor}, id: {id})"), - )); - } - - ctx_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove context").with_source(err) - })?; - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - Ok(()) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - /// Removes all context entries for an actor. - /// - /// Returns the number of entries removed. 
- pub async fn unregister_all_contexts(&self, actor: Uuid) -> Result { - let prefix = actor.as_bytes().to_vec(); - let ctx_ks = self.contexts.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result { - let keys: Vec> = ctx_ks - .prefix(&prefix) - .map(|guard| { - let key = guard.key().map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to iterate contexts keyspace") - .with_source(err) - })?; - Ok(key.to_vec()) - }) - .collect::>>()?; - - let count = keys.len(); - - for key in &keys { - ctx_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove context").with_source(err) - })?; - } - - if count > 0 { - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - } - - Ok(count) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - /// Lists all context IDs for an actor. - pub async fn list_contexts(&self, actor: Uuid) -> Result> { - let prefix = actor.as_bytes().to_vec(); - let ctx_ks = self.contexts.clone(); - - tokio::task::spawn_blocking(move || -> Result> { - let mut ids = Vec::new(); - for guard in ctx_ks.prefix(&prefix) { - let key = guard.key().map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to iterate contexts keyspace") - .with_source(err) - })?; - if key.len() == 32 - && let Ok(bytes) = <[u8; 16]>::try_from(&key[16..]) - { - ids.push(Uuid::from_bytes(bytes)); - } - } - ids.sort(); - Ok(ids) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - /// Returns the base directory path (the database location). 
- pub fn base_dir(&self) -> &Path { - &self.base_dir - } -} - -#[cfg(test)] -mod tests { - use nvisy_core::content::{Content, ContentData}; - use nvisy_ontology::context::Context; - - use super::*; - - fn open_temp_registry() -> (tempfile::TempDir, Registry) { - let temp = tempfile::TempDir::new().unwrap(); - let registry = Registry::open(temp.path().join("data")).unwrap(); - (temp, registry) - } - - #[tokio::test] - async fn register_and_read_content() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - let content = Content::new(ContentData::from("Hello, world!")); - - let handle = registry.register_content(actor, content).await.unwrap(); - let data = handle.content_data().await.unwrap(); - assert_eq!(data.as_str().unwrap(), "Hello, world!"); - } - - #[tokio::test] - async fn content_scoped_by_actor() { - let (_temp, registry) = open_temp_registry(); - let actor_a = Uuid::now_v7(); - let actor_b = Uuid::now_v7(); - - let content = Content::new(ContentData::from("actor A only")); - let handle = registry.register_content(actor_a, content).await.unwrap(); - let id = handle.content_source().as_uuid(); - - // Actor B cannot see actor A's content - let err = registry.read_content(actor_b, id).await.unwrap_err(); - assert_eq!(err.kind, ErrorKind::NotFound); - - // Actor A can - registry.read_content(actor_a, id).await.unwrap(); - } - - #[tokio::test] - async fn list_content_per_actor() { - let (_temp, registry) = open_temp_registry(); - let actor_a = Uuid::now_v7(); - let actor_b = Uuid::now_v7(); - - registry - .register_content(actor_a, Content::new(ContentData::from("a1"))) - .await - .unwrap(); - registry - .register_content(actor_a, Content::new(ContentData::from("a2"))) - .await - .unwrap(); - registry - .register_content(actor_b, Content::new(ContentData::from("b1"))) - .await - .unwrap(); - - assert_eq!(registry.list_content(actor_a).await.unwrap().len(), 2); - assert_eq!(registry.list_content(actor_b).await.unwrap().len(), 1); - } 
- - #[tokio::test] - async fn unregister_content() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - let content = Content::new(ContentData::from("delete me")); - let id = content.content_source().as_uuid(); - registry.register_content(actor, content).await.unwrap(); - - registry.unregister_content(actor, id).await.unwrap(); - - let err = registry.read_content(actor, id).await.unwrap_err(); - assert_eq!(err.kind, ErrorKind::NotFound); - } - - #[tokio::test] - async fn unregister_all_content() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - - registry - .register_content(actor, Content::new(ContentData::from("first"))) - .await - .unwrap(); - registry - .register_content(actor, Content::new(ContentData::from("second"))) - .await - .unwrap(); - - let deleted = registry.unregister_all_content(actor).await.unwrap(); - assert_eq!(deleted, 2); - assert!(registry.list_content(actor).await.unwrap().is_empty()); - } - - #[tokio::test] - async fn register_and_read_context() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - let ctx = Context::new("test-context", vec![]); - - let handle = registry.register_context(actor, ctx.clone()).await.unwrap(); - let read_ctx = handle.context().await.unwrap(); - assert_eq!(read_ctx.name, "test-context"); - } - - #[tokio::test] - async fn context_scoped_by_actor() { - let (_temp, registry) = open_temp_registry(); - let actor_a = Uuid::now_v7(); - let actor_b = Uuid::now_v7(); - - let ctx = Context::new("private", vec![]); - let handle = registry.register_context(actor_a, ctx).await.unwrap(); - let id = handle.source().as_uuid(); - - let err = registry.read_context(actor_b, id).await.unwrap_err(); - assert_eq!(err.kind, ErrorKind::NotFound); - - registry.read_context(actor_a, id).await.unwrap(); - } - - #[tokio::test] - async fn list_contexts_per_actor() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - - 
registry - .register_context(actor, Context::new("ctx-1", vec![])) - .await - .unwrap(); - registry - .register_context(actor, Context::new("ctx-2", vec![])) - .await - .unwrap(); - - assert_eq!(registry.list_contexts(actor).await.unwrap().len(), 2); - } - - #[tokio::test] - async fn unregister_context() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - let ctx = Context::new("remove-me", vec![]); - let id = ctx.source.as_uuid(); - - registry.register_context(actor, ctx).await.unwrap(); - registry.unregister_context(actor, id).await.unwrap(); - - let err = registry.read_context(actor, id).await.unwrap_err(); - assert_eq!(err.kind, ErrorKind::NotFound); - } - - #[tokio::test] - async fn unregister_all_contexts() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - - registry - .register_context(actor, Context::new("c1", vec![])) - .await - .unwrap(); - registry - .register_context(actor, Context::new("c2", vec![])) - .await - .unwrap(); - - let deleted = registry.unregister_all_contexts(actor).await.unwrap(); - assert_eq!(deleted, 2); - assert!(registry.list_contexts(actor).await.unwrap().is_empty()); - } - - #[tokio::test] - async fn data_persists_across_reopen() { - let temp = tempfile::TempDir::new().unwrap(); - let path = temp.path().join("data"); - let actor = Uuid::now_v7(); - - let content = Content::new(ContentData::from("persistent")); - let id = content.content_source().as_uuid(); - - { - let registry = Registry::open(&path).unwrap(); - registry.register_content(actor, content).await.unwrap(); - } - - let registry = Registry::open(&path).unwrap(); - let handle = registry.read_content(actor, id).await.unwrap(); - let data = handle.content_data().await.unwrap(); - assert_eq!(data.as_str().unwrap(), "persistent"); - } - - #[tokio::test] - async fn base_dir() { - let temp = tempfile::TempDir::new().unwrap(); - let base = temp.path().join("data"); - let registry = Registry::open(&base).unwrap(); - 
assert_eq!(registry.base_dir(), base); - } -} From 189e60a6b985c539cbb5718d268f3e01634eca89 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Fri, 13 Mar 2026 11:44:16 +0100 Subject: [PATCH 09/10] refactor(python): replace NER/OCR/transcribe modules with ExifModule Remove the free-function NER, OCR, and transcription modules and introduce a structured ExifModule that takes ContentData (arc-backed, zero-copy) and ExifParams (Copy). Add tracing instrumentation to PythonBridge and ExifModule, improve docs, and clean up error messages. Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 1 + crates/nvisy-python/Cargo.toml | 7 +- crates/nvisy-python/src/bridge/error.rs | 7 +- crates/nvisy-python/src/bridge/mod.rs | 82 ++++++++++--- crates/nvisy-python/src/exif/mod.rs | 12 ++ crates/nvisy-python/src/exif/module.rs | 134 +++++++++++++++++++++ crates/nvisy-python/src/exif/params.rs | 19 +++ crates/nvisy-python/src/lib.rs | 4 +- crates/nvisy-python/src/ner/mod.rs | 139 ---------------------- crates/nvisy-python/src/ocr/mod.rs | 96 --------------- crates/nvisy-python/src/prelude.rs | 4 +- crates/nvisy-python/src/transcribe/mod.rs | 100 ---------------- 12 files changed, 242 insertions(+), 363 deletions(-) create mode 100644 crates/nvisy-python/src/exif/mod.rs create mode 100644 crates/nvisy-python/src/exif/module.rs create mode 100644 crates/nvisy-python/src/exif/params.rs delete mode 100644 crates/nvisy-python/src/ner/mod.rs delete mode 100644 crates/nvisy-python/src/ocr/mod.rs delete mode 100644 crates/nvisy-python/src/transcribe/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 4394e88..20460fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3061,6 +3061,7 @@ dependencies = [ "pythonize", "serde_json", "tokio", + "tracing", ] [[package]] diff --git a/crates/nvisy-python/Cargo.toml b/crates/nvisy-python/Cargo.toml index 6899fe5..f3db0c7 100644 --- a/crates/nvisy-python/Cargo.toml +++ b/crates/nvisy-python/Cargo.toml @@ -2,8 +2,8 @@ [package] name = "nvisy-python" -description = 
"PyO3 bridge for AI NER/OCR detection via embedded Python" -keywords = ["nvisy", "python", "pyo3", "ner"] +description = "PyO3 bridge for Python-backed processing via embedded Python" +keywords = ["nvisy", "python", "pyo3", "exif"] categories = ["api-bindings"] readme = "README.md" @@ -35,6 +35,9 @@ hipstr = { workspace = true, features = [] } # Async runtime and parallelism tokio = { workspace = true, features = ["sync", "rt"] } +# Observability +tracing = { workspace = true, features = [] } + # Python interop pyo3 = { workspace = true, features = ["auto-initialize"] } pyo3-async-runtimes = { workspace = true, features = [] } diff --git a/crates/nvisy-python/src/bridge/error.rs b/crates/nvisy-python/src/bridge/error.rs index 3d0e685..c6a6b62 100644 --- a/crates/nvisy-python/src/bridge/error.rs +++ b/crates/nvisy-python/src/bridge/error.rs @@ -1,15 +1,16 @@ -//! Conversion utilities from Python errors to [`Error`]. +//! Conversion from Python errors to [`Error`]. use nvisy_core::Error; use pyo3::PyErr; use pyo3::types::PyTracebackMethods; -/// Convert a [`PyErr`] into an [`Error`], preserving the Python traceback when available. +/// Converts a [`PyErr`] into an [`Error`], preserving the Python +/// traceback when available. pub fn from_pyerr(err: PyErr) -> Error { pyo3::Python::with_gil(|py| { let traceback = err.traceback(py).map(|tb| tb.format().unwrap_or_default()); let msg = match traceback { - Some(tb) => format!("{}\n{}", err, tb), + Some(tb) => format!("{err}\n{tb}"), None => err.to_string(), }; Error::runtime(msg, "python", false) diff --git a/crates/nvisy-python/src/bridge/mod.rs b/crates/nvisy-python/src/bridge/mod.rs index cf3485d..1c381db 100644 --- a/crates/nvisy-python/src/bridge/mod.rs +++ b/crates/nvisy-python/src/bridge/mod.rs @@ -1,7 +1,7 @@ //! Lightweight handle to a Python module loaded via PyO3. //! -//! Provides [`PythonBridge`] — a thin wrapper that remembers which Python -//! 
module to import — plus helpers for calling synchronous and asynchronous +//! Provides [`PythonBridge`]: a thin wrapper that remembers which Python +//! module to import, plus helpers for calling synchronous and asynchronous //! Python functions from Rust async code. mod error; @@ -14,45 +14,75 @@ use serde_json::Value; pub use self::error::from_pyerr; -/// Lightweight handle to a Python NER module. +const TARGET: &str = "nvisy_python::bridge"; + +/// Lightweight handle to a Python module. /// -/// The bridge does **not** hold the GIL or any Python objects; it simply -/// remembers which module to `import` when a detection function is called. +/// The bridge does **not** hold the GIL or any Python objects: it simply +/// remembers which module to `import` when a function is called. /// The default module name is `"nvisy_ai"`. #[derive(Clone)] pub struct PythonBridge { - /// Dotted Python module name to import (e.g., `"nvisy_ai"`). + /// Dotted Python module name to import (e.g. `"nvisy_ai"`). module_name: HipStr<'static>, } +impl std::fmt::Debug for PythonBridge { + /// Formats the bridge for debugging, showing only the module name. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PythonBridge") + .field("module_name", &self.module_name.as_str()) + .finish() + } +} + impl PythonBridge { - /// Create a new bridge that will load the given Python module. + /// Creates a new bridge that will load the given Python module. pub fn new(module_name: impl Into>) -> Self { Self { module_name: module_name.into(), } } - /// Initialize Python and verify the module can be imported. + /// Initializes Python and verifies the module can be imported. + /// + /// # Errors + /// + /// Returns an error if the Python interpreter cannot be started or + /// the module cannot be imported. 
+ #[tracing::instrument(target = TARGET, name = "bridge.init", skip(self), fields(module = %self.module_name))] pub fn init(&self) -> Result<(), Error> { Python::with_gil(|py| { py.import(&*self.module_name).map_err(from_pyerr)?; + tracing::debug!(target: TARGET, "python module imported"); Ok(()) }) } - /// Get the module name. + /// Returns the dotted Python module name. + #[must_use] pub fn module_name(&self) -> &str { &self.module_name } - /// Call a **synchronous** Python method on the bridge module inside + /// Calls a **synchronous** Python method on the bridge module inside /// `spawn_blocking` + `Python::with_gil`. /// - /// `build_kwargs` receives a GIL token and must return a [`PyDict`] of - /// keyword arguments. The method is invoked as - /// `module.(**, kwargs)` and the return value is deserialized + /// `build_kwargs` receives a GIL token and must return a [`PyDict`] + /// of keyword arguments. The method is invoked as + /// `module.(**kwargs)` and the return value is deserialized /// into `Vec`. + /// + /// # Errors + /// + /// Returns an error if the Python call fails or the return value + /// cannot be deserialized. 
+ #[tracing::instrument( + target = TARGET, + name = "bridge.call_sync", + skip(self, build_kwargs), + fields(module = %self.module_name, method), + )] pub async fn call_sync(&self, method: &str, build_kwargs: F) -> Result, Error> where F: FnOnce(Python<'_>) -> Result, Error> + Send + 'static, @@ -60,6 +90,8 @@ impl PythonBridge { let module_name = self.module_name.clone(); let method = method.to_string(); + tracing::Span::current().record("method", &method); + tokio::task::spawn_blocking(move || { Python::with_gil(|py| { let module = py.import(&*module_name).map_err(from_pyerr)?; @@ -71,7 +103,7 @@ impl PythonBridge { pythonize::depythonize::>(&result).map_err(|e| { Error::runtime( - format!("Failed to deserialize {} result: {}", method, e), + format!("failed to deserialize {method} result: {e}"), "python", false, ) @@ -79,17 +111,28 @@ impl PythonBridge { }) }) .await - .map_err(|e| Error::runtime(format!("Task join error: {}", e), "python", false))? + .map_err(|e| Error::runtime(format!("blocking task panicked: {e}"), "python", false))? } - /// Call an **asynchronous** (coroutine) Python method on the bridge + /// Calls an **asynchronous** (coroutine) Python method on the bridge /// module. /// /// Acquires the GIL, invokes `module.(**kwargs)` to obtain a /// Python coroutine, converts it to a Rust [`Future`] via /// [`pyo3_async_runtimes::tokio::into_future`], and awaits it on the - /// Tokio runtime. The coroutine's return value is deserialized into + /// Tokio runtime. The coroutine's return value is deserialized into /// `Vec`. + /// + /// # Errors + /// + /// Returns an error if the Python call fails or the return value + /// cannot be deserialized. 
+ #[tracing::instrument( + target = TARGET, + name = "bridge.call_async", + skip(self, build_kwargs), + fields(module = %self.module_name, method), + )] pub async fn call_async(&self, method: &str, build_kwargs: F) -> Result, Error> where F: FnOnce(Python<'_>) -> Result, Error> + Send + 'static, @@ -97,6 +140,8 @@ impl PythonBridge { use std::future::Future; use std::pin::Pin; + tracing::Span::current().record("method", method); + let future: Pin> + Send>> = Python::with_gil(|py| -> Result<_, Error> { let module = py.import(&*self.module_name).map_err(from_pyerr)?; @@ -116,7 +161,7 @@ impl PythonBridge { Python::with_gil(|py| { pythonize::depythonize::>(py_result.bind(py)).map_err(|e| { Error::runtime( - format!("Failed to deserialize {} result: {}", method, e), + format!("failed to deserialize {method} result: {e}"), "python", false, ) @@ -126,6 +171,7 @@ impl PythonBridge { } impl Default for PythonBridge { + /// Creates a bridge with the default module name `"nvisy_ai"`. fn default() -> Self { Self::new("nvisy_ai") } diff --git a/crates/nvisy-python/src/exif/mod.rs b/crates/nvisy-python/src/exif/mod.rs new file mode 100644 index 0000000..5c7975e --- /dev/null +++ b/crates/nvisy-python/src/exif/mod.rs @@ -0,0 +1,12 @@ +//! EXIF metadata extraction via the Python backend. +//! +//! Provides [`ExifModule`]: a configured handle that calls +//! `nvisy_ai.extract_exif()` through the [`PythonBridge`](crate::bridge::PythonBridge) +//! to extract EXIF metadata from images. Returns raw JSON values: +//! metadata construction is handled by the caller. + +mod module; +mod params; + +pub use self::module::ExifModule; +pub use self::params::ExifParams; diff --git a/crates/nvisy-python/src/exif/module.rs b/crates/nvisy-python/src/exif/module.rs new file mode 100644 index 0000000..db833f2 --- /dev/null +++ b/crates/nvisy-python/src/exif/module.rs @@ -0,0 +1,134 @@ +//! [`ExifModule`]: EXIF extraction via the Python bridge. 
+ +use nvisy_core::Error; +use nvisy_core::content::ContentData; +use pyo3::prelude::*; +use pyo3::types::PyDict; +use serde_json::Value; + +use super::params::ExifParams; +use crate::bridge::{PythonBridge, from_pyerr}; + +const TARGET: &str = "nvisy_python::exif"; + +/// Configured handle for EXIF metadata extraction. +/// +/// Holds a [`PythonBridge`] and [`ExifParams`] so callers do not need +/// to pass them on every invocation. +#[derive(Debug, Clone)] +pub struct ExifModule { + /// Python bridge used to call into the `nvisy_ai` module. + bridge: PythonBridge, + /// Extraction parameters applied to every call. + params: ExifParams, +} + +impl ExifModule { + /// Creates a new module with the given bridge and parameters. + pub fn new(bridge: PythonBridge, params: ExifParams) -> Self { + Self { bridge, params } + } + + /// Returns a reference to the underlying bridge. + #[must_use] + pub fn bridge(&self) -> &PythonBridge { + &self.bridge + } + + /// Returns a reference to the current parameters. + #[must_use] + pub fn params(&self) -> &ExifParams { + &self.params + } + + /// Calls Python `extract_exif()` synchronously via `spawn_blocking`. + /// + /// Returns raw JSON dicts containing EXIF tag key-value pairs. + /// The MIME type is resolved from `content.content_type()`, + /// defaulting to `"application/octet-stream"` when unavailable. + /// + /// # Errors + /// + /// Returns an error if the Python call fails or the return value + /// cannot be deserialized. + #[tracing::instrument( + target = TARGET, + name = "exif.extract", + skip(self, content), + fields(data_len = content.size()), + )] + pub async fn extract(&self, content: ContentData) -> Result, Error> { + let request = ExifRequest::new(content, self.params); + + self.bridge + .call_sync("extract_exif", move |py| request.to_kwargs(py)) + .await + } + + /// Calls Python `extract_exif()` as a **coroutine** (async Python + /// function). + /// + /// Returns raw JSON dicts containing EXIF tag key-value pairs. 
+ /// The MIME type is resolved from `content.content_type()`, + /// defaulting to `"application/octet-stream"` when unavailable. + /// + /// # Errors + /// + /// Returns an error if the Python call fails or the return value + /// cannot be deserialized. + #[tracing::instrument( + target = TARGET, + name = "exif.extract_async", + skip(self, content), + fields(data_len = content.size()), + )] + pub async fn extract_async(&self, content: ContentData) -> Result, Error> { + let request = ExifRequest::new(content, self.params); + + self.bridge + .call_async("extract_exif", move |py| request.to_kwargs(py)) + .await + } +} + +/// Owned snapshot of a single EXIF extraction request. +/// +/// Wraps [`ContentData`] and [`ExifParams`] so they can be moved into +/// a `Send + 'static` closure for the bridge call. No extra allocations: +/// `ContentData` is internally arc-backed. +struct ExifRequest { + /// Content to extract EXIF metadata from. + content: ContentData, + /// Extraction parameters. + params: ExifParams, +} + +impl ExifRequest { + /// Creates a new request from content data and parameters. + fn new(content: ContentData, params: ExifParams) -> Self { + Self { content, params } + } + + /// Converts the request into a Python keyword arguments dict. 
+ fn to_kwargs<'py>(&self, py: Python<'py>) -> Result, Error> { + let mime_type = self + .content + .content_type() + .unwrap_or("application/octet-stream"); + + let kwargs = PyDict::new(py); + kwargs + .set_item("image_bytes", self.content.as_bytes()) + .map_err(from_pyerr)?; + kwargs + .set_item("mime_type", mime_type) + .map_err(from_pyerr)?; + kwargs + .set_item("include_gps", self.params.include_gps) + .map_err(from_pyerr)?; + kwargs + .set_item("include_thumbnail", self.params.include_thumbnail) + .map_err(from_pyerr)?; + Ok(kwargs) + } +} diff --git a/crates/nvisy-python/src/exif/params.rs b/crates/nvisy-python/src/exif/params.rs new file mode 100644 index 0000000..0352b20 --- /dev/null +++ b/crates/nvisy-python/src/exif/params.rs @@ -0,0 +1,19 @@ +//! [`ExifParams`]: configuration for EXIF extraction calls. + +/// Parameters for EXIF extraction. +#[derive(Debug, Clone, Copy)] +pub struct ExifParams { + /// Whether to include GPS coordinates in the output. + pub include_gps: bool, + /// Whether to include thumbnail data in the output. + pub include_thumbnail: bool, +} + +impl Default for ExifParams { + fn default() -> Self { + Self { + include_gps: true, + include_thumbnail: false, + } + } +} diff --git a/crates/nvisy-python/src/lib.rs b/crates/nvisy-python/src/lib.rs index 96ee78e..54c3078 100644 --- a/crates/nvisy-python/src/lib.rs +++ b/crates/nvisy-python/src/lib.rs @@ -3,9 +3,7 @@ #![doc = include_str!("../README.md")] pub mod bridge; -pub mod ner; -pub mod ocr; -pub mod transcribe; +pub mod exif; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-python/src/ner/mod.rs b/crates/nvisy-python/src/ner/mod.rs deleted file mode 100644 index 71d568e..0000000 --- a/crates/nvisy-python/src/ner/mod.rs +++ /dev/null @@ -1,139 +0,0 @@ -//! Named-entity recognition (NER) detection via a Python AI backend. -//! -//! Functions in this module call into the Python `nvisy_ai` module via -//! [`PythonBridge`] and return raw JSON values. Entity construction is -//! 
handled by the pipeline's `NerBackend` / `DetectNerAction` layer. - -use nvisy_core::Error; -use pyo3::prelude::*; -use pyo3::types::PyDict; -use serde_json::Value; - -use crate::bridge::{PythonBridge, from_pyerr}; - -/// Parameters for NER detection, independent of any pipeline types. -#[derive(Debug, Clone)] -pub struct NerParams { - /// Entity type labels to detect (e.g., `["PERSON", "SSN"]`). - pub entity_types: Vec, - /// Minimum confidence score to include a detection (0.0 -- 1.0). - pub confidence_threshold: f64, -} - -/// Call Python `detect_ner()` synchronously via `spawn_blocking`. -/// -/// Returns raw JSON dicts — no domain-type construction. -pub async fn detect_ner( - bridge: &PythonBridge, - text: &str, - params: &NerParams, -) -> Result, Error> { - let text = text.to_string(); - let params = params.clone(); - - bridge - .call_sync("detect_ner", move |py| { - let kwargs = PyDict::new(py); - kwargs.set_item("text", &text).map_err(from_pyerr)?; - kwargs - .set_item("entity_types", ¶ms.entity_types) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} - -/// Call Python `detect_ner_image()` synchronously via `spawn_blocking`. -/// -/// Returns raw JSON dicts — no domain-type construction. 
-pub async fn detect_ner_image( - bridge: &PythonBridge, - image_data: &[u8], - mime_type: &str, - params: &NerParams, -) -> Result, Error> { - let image_data = image_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_sync("detect_ner_image", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("image_bytes", &image_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("entity_types", ¶ms.entity_types) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} - -/// Call Python `detect_ner()` as a **coroutine** (async Python function). -/// -/// Returns raw JSON dicts — no domain-type construction. -pub async fn detect_ner_async( - bridge: &PythonBridge, - text: &str, - params: &NerParams, -) -> Result, Error> { - let text = text.to_string(); - let params = params.clone(); - - bridge - .call_async("detect_ner", move |py| { - let kwargs = PyDict::new(py); - kwargs.set_item("text", &text).map_err(from_pyerr)?; - kwargs - .set_item("entity_types", ¶ms.entity_types) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} - -/// Call Python `detect_ner_image()` as a **coroutine** (async Python function). -/// -/// Returns raw JSON dicts — no domain-type construction. 
-pub async fn detect_ner_image_async( - bridge: &PythonBridge, - image_data: &[u8], - mime_type: &str, - params: &NerParams, -) -> Result, Error> { - let image_data = image_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_async("detect_ner_image", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("image_bytes", &image_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("entity_types", ¶ms.entity_types) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} diff --git a/crates/nvisy-python/src/ocr/mod.rs b/crates/nvisy-python/src/ocr/mod.rs deleted file mode 100644 index 87c12b3..0000000 --- a/crates/nvisy-python/src/ocr/mod.rs +++ /dev/null @@ -1,96 +0,0 @@ -//! OCR text extraction via the Python backend. -//! -//! Calls `nvisy_ai.detect_ocr()` through the Python bridge to perform -//! optical character recognition on images, returning raw JSON values. -//! Entity construction is handled by the pipeline's `OcrBackend` / -//! `GenerateOcrAction` layer. - -use nvisy_core::Error; -use pyo3::prelude::*; -use pyo3::types::PyDict; -use serde_json::Value; - -use crate::bridge::{PythonBridge, from_pyerr}; - -/// Parameters for OCR detection, independent of any pipeline types. -#[derive(Debug, Clone)] -pub struct OcrParams { - /// Language hint (e.g. `"eng"` for English). - pub language: String, - /// OCR engine to use (`"tesseract"`, `"google-vision"`, `"aws-textract"`). - pub engine: String, - /// Minimum confidence threshold for OCR results. - pub confidence_threshold: f64, -} - -/// Call Python `detect_ocr()` synchronously via `spawn_blocking`. -/// -/// Returns raw JSON dicts — no domain-type construction. 
-pub async fn detect_ocr( - bridge: &PythonBridge, - image_data: &[u8], - mime_type: &str, - params: &OcrParams, -) -> Result, Error> { - let image_data = image_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_sync("detect_ocr", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("image_bytes", &image_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("language", ¶ms.language) - .map_err(from_pyerr)?; - kwargs - .set_item("engine", ¶ms.engine) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} - -/// Call Python `detect_ocr()` as a **coroutine** (async Python function). -/// -/// Returns raw JSON dicts — no domain-type construction. -pub async fn detect_ocr_async( - bridge: &PythonBridge, - image_data: &[u8], - mime_type: &str, - params: &OcrParams, -) -> Result, Error> { - let image_data = image_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_async("detect_ocr", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("image_bytes", &image_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("language", ¶ms.language) - .map_err(from_pyerr)?; - kwargs - .set_item("engine", ¶ms.engine) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} diff --git a/crates/nvisy-python/src/prelude.rs b/crates/nvisy-python/src/prelude.rs index e3b932b..60c6624 100644 --- a/crates/nvisy-python/src/prelude.rs +++ b/crates/nvisy-python/src/prelude.rs @@ -1,4 +1,4 @@ //! Convenience re-exports. 
+ pub use crate::bridge::PythonBridge; -pub use crate::ner::NerParams; -pub use crate::ocr::OcrParams; +pub use crate::exif::{ExifModule, ExifParams}; diff --git a/crates/nvisy-python/src/transcribe/mod.rs b/crates/nvisy-python/src/transcribe/mod.rs deleted file mode 100644 index 54dc337..0000000 --- a/crates/nvisy-python/src/transcribe/mod.rs +++ /dev/null @@ -1,100 +0,0 @@ -//! Speech-to-text transcription via the Python backend. -//! -//! Calls `nvisy_ai.transcribe()` through the Python bridge to perform -//! speech transcription on audio, returning raw JSON values. - -use nvisy_core::Error; -use pyo3::prelude::*; -use pyo3::types::PyDict; -use serde_json::Value; - -use crate::bridge::{PythonBridge, from_pyerr}; - -/// Parameters for transcription, independent of any pipeline types. -#[derive(Debug, Clone)] -pub struct TranscribeParams { - /// BCP-47 language tag for transcription. - pub language: String, - /// Whether to perform speaker diarization. - pub enable_speaker_diarization: bool, - /// Minimum confidence threshold for results. - pub confidence_threshold: f64, -} - -/// Call Python `transcribe()` synchronously via `spawn_blocking`. -/// -/// Returns raw JSON dicts — no domain-type construction. 
-pub async fn transcribe( - bridge: &PythonBridge, - audio_data: &[u8], - mime_type: &str, - params: &TranscribeParams, -) -> Result, Error> { - let audio_data = audio_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_sync("transcribe", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("audio_bytes", &audio_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("language", ¶ms.language) - .map_err(from_pyerr)?; - kwargs - .set_item( - "enable_speaker_diarization", - params.enable_speaker_diarization, - ) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} - -/// Call Python `transcribe()` as a **coroutine** (async Python function). -/// -/// Returns raw JSON dicts — no domain-type construction. -pub async fn transcribe_async( - bridge: &PythonBridge, - audio_data: &[u8], - mime_type: &str, - params: &TranscribeParams, -) -> Result, Error> { - let audio_data = audio_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_async("transcribe", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("audio_bytes", &audio_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("language", ¶ms.language) - .map_err(from_pyerr)?; - kwargs - .set_item( - "enable_speaker_diarization", - params.enable_speaker_diarization, - ) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} From 159809550a6672510532f23bd299bf13d2fc3bfa Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Fri, 13 Mar 2026 11:46:51 +0100 Subject: [PATCH 10/10] style(python): move rustdoc link definition to bottom of doc comment 
Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-python/src/exif/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/nvisy-python/src/exif/mod.rs b/crates/nvisy-python/src/exif/mod.rs index 5c7975e..7eac7d8 100644 --- a/crates/nvisy-python/src/exif/mod.rs +++ b/crates/nvisy-python/src/exif/mod.rs @@ -1,9 +1,11 @@ //! EXIF metadata extraction via the Python backend. //! //! Provides [`ExifModule`]: a configured handle that calls -//! `nvisy_ai.extract_exif()` through the [`PythonBridge`](crate::bridge::PythonBridge) +//! `nvisy_ai.extract_exif()` through the [`PythonBridge`] //! to extract EXIF metadata from images. Returns raw JSON values: //! metadata construction is handled by the caller. +//! +//! [`PythonBridge`]: crate::bridge::PythonBridge mod module; mod params;