diff --git a/Cargo.lock b/Cargo.lock index ab4a72a0..20460fcd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3045,6 +3045,7 @@ dependencies = [ "regex", "serde", "serde_json", + "tempfile", "thiserror 2.0.18", "tracing", ] @@ -3060,6 +3061,7 @@ dependencies = [ "pythonize", "serde_json", "tokio", + "tracing", ] [[package]] @@ -3074,6 +3076,7 @@ dependencies = [ "serde_json", "tempfile", "tokio", + "tracing", "uuid", ] diff --git a/crates/nvisy-cli/src/config/mod.rs b/crates/nvisy-cli/src/config/mod.rs index 32ae3a3b..f09f45b8 100644 --- a/crates/nvisy-cli/src/config/mod.rs +++ b/crates/nvisy-cli/src/config/mod.rs @@ -31,11 +31,12 @@ mod server; use std::path::PathBuf; use clap::Parser; -pub use file::MiddlewareSection; use nvisy_engine::RuntimeConfig; -pub use server::{ResolvedServer, ServerConfig}; use tracing_subscriber::EnvFilter; +pub use self::file::MiddlewareSection; +pub use self::server::{ResolvedServer, ServerConfig}; + /// Top-level CLI entry point. /// /// Parses command-line arguments and loads the TOML configuration file. 
diff --git a/crates/nvisy-cli/src/server/mod.rs b/crates/nvisy-cli/src/server/mod.rs index 8f1510f8..7ae6dbc2 100644 --- a/crates/nvisy-cli/src/server/mod.rs +++ b/crates/nvisy-cli/src/server/mod.rs @@ -3,4 +3,4 @@ mod listen; mod shutdown; -pub use listen::run; +pub use self::listen::run; diff --git a/crates/nvisy-codec/src/document/mod.rs b/crates/nvisy-codec/src/document/mod.rs index c8303dea..fdfb5a74 100644 --- a/crates/nvisy-codec/src/document/mod.rs +++ b/crates/nvisy-codec/src/document/mod.rs @@ -9,9 +9,9 @@ use nvisy_core::content::{ContentData, ContentSource}; use nvisy_core::media::{ AudioFormat, DocumentType, ImageFormat, SpreadsheetFormat, TextFormat, WordFormat, }; -pub use span::Span; -pub use stream::SpanStream; +pub use self::span::Span; +pub use self::stream::SpanStream; use crate::handler::{ BoxedAudioHandler, BoxedImageHandler, BoxedRichHandler, BoxedTextHandler, CsvLoader, CsvParams, Handler, HtmlLoader, HtmlParams, JpegLoader, JpegParams, JsonLoader, JsonParams, Loader, diff --git a/crates/nvisy-codec/src/handler/audio/mod.rs b/crates/nvisy-codec/src/handler/audio/mod.rs index 8a3825e1..0a58386e 100644 --- a/crates/nvisy-codec/src/handler/audio/mod.rs +++ b/crates/nvisy-codec/src/handler/audio/mod.rs @@ -14,14 +14,14 @@ mod mp3_loader; mod wav_handler; mod wav_loader; -pub use audio_data::AudioData; -pub use audio_handler::BoxedAudioHandler; -use audio_handler_macro::impl_audio_handler; -pub use audio_span_id::AudioSpanId; -pub use mp3_handler::Mp3Handler; -pub use mp3_loader::{Mp3Loader, Mp3Params}; -pub use wav_handler::WavHandler; -pub use wav_loader::{WavLoader, WavParams}; +pub use self::audio_data::AudioData; +pub use self::audio_handler::BoxedAudioHandler; +use self::audio_handler_macro::impl_audio_handler; +pub use self::audio_span_id::AudioSpanId; +pub use self::mp3_handler::Mp3Handler; +pub use self::mp3_loader::{Mp3Loader, Mp3Params}; +pub use self::wav_handler::WavHandler; +pub use self::wav_loader::{WavLoader, WavParams}; /// 
Capability trait for handlers that expose audio content. /// diff --git a/crates/nvisy-codec/src/handler/image/mod.rs b/crates/nvisy-codec/src/handler/image/mod.rs index 8f1063f2..140ade5d 100644 --- a/crates/nvisy-codec/src/handler/image/mod.rs +++ b/crates/nvisy-codec/src/handler/image/mod.rs @@ -16,14 +16,14 @@ mod jpeg_loader; mod png_handler; mod png_loader; -pub use image_data::ImageData; -pub use image_handler::BoxedImageHandler; -pub(crate) use image_handler_macro::impl_image_handler; -pub use image_span_id::ImageSpanId; -pub use jpeg_handler::JpegHandler; -pub use jpeg_loader::{JpegLoader, JpegParams}; -pub use png_handler::PngHandler; -pub use png_loader::{PngLoader, PngParams}; +pub use self::image_data::ImageData; +pub use self::image_handler::BoxedImageHandler; +pub(crate) use self::image_handler_macro::impl_image_handler; +pub use self::image_span_id::ImageSpanId; +pub use self::jpeg_handler::JpegHandler; +pub use self::jpeg_loader::{JpegLoader, JpegParams}; +pub use self::png_handler::PngHandler; +pub use self::png_loader::{PngLoader, PngParams}; /// Capability trait for handlers that expose image content. /// diff --git a/crates/nvisy-codec/src/handler/mod.rs b/crates/nvisy-codec/src/handler/mod.rs index e9cba021..95de961c 100644 --- a/crates/nvisy-codec/src/handler/mod.rs +++ b/crates/nvisy-codec/src/handler/mod.rs @@ -17,11 +17,12 @@ mod image; mod rich; mod text; -pub use audio::*; -pub use image::*; use nvisy_core::content::ContentSource; -pub use rich::*; -pub use text::*; + +pub use self::audio::*; +pub use self::image::*; +pub use self::rich::*; +pub use self::text::*; /// Base trait implemented by all format handlers. 
/// diff --git a/crates/nvisy-codec/src/handler/rich/mod.rs b/crates/nvisy-codec/src/handler/rich/mod.rs index aa1a7e0d..a8aaede1 100644 --- a/crates/nvisy-codec/src/handler/rich/mod.rs +++ b/crates/nvisy-codec/src/handler/rich/mod.rs @@ -13,9 +13,9 @@ mod docx_loader; mod rich_handler; #[cfg(feature = "docx")] -pub use docx_loader::{DocxLoader, DocxParams}; +pub use self::docx_loader::{DocxLoader, DocxParams}; #[cfg(feature = "pdf")] -pub use pdf_handler::{RichTextHandler, RichTextSpan}; +pub use self::pdf_handler::{RichTextHandler, RichTextSpan}; #[cfg(feature = "pdf")] -pub use pdf_loader::{PdfLoader, PdfParams}; -pub use rich_handler::BoxedRichHandler; +pub use self::pdf_loader::{PdfLoader, PdfParams}; +pub use self::rich_handler::BoxedRichHandler; diff --git a/crates/nvisy-codec/src/handler/text/mod.rs b/crates/nvisy-codec/src/handler/text/mod.rs index c9c0b3cf..80e1693b 100644 --- a/crates/nvisy-codec/src/handler/text/mod.rs +++ b/crates/nvisy-codec/src/handler/text/mod.rs @@ -25,22 +25,22 @@ mod xlsx_handler; #[cfg(feature = "xlsx")] mod xlsx_loader; -pub use csv_handler::{CsvData, CsvHandler, CsvSpan}; -pub use csv_loader::{CsvLoader, CsvParams}; +pub use self::csv_handler::{CsvData, CsvHandler, CsvSpan}; +pub use self::csv_loader::{CsvLoader, CsvParams}; #[cfg(feature = "html")] -pub use html_handler::{HtmlData, HtmlHandler, HtmlSpan}; +pub use self::html_handler::{HtmlData, HtmlHandler, HtmlSpan}; #[cfg(feature = "html")] -pub use html_loader::{HtmlLoader, HtmlParams}; -pub use json_handler::{JsonData, JsonHandler, JsonIndent, JsonPath}; -pub use json_loader::{JsonLoader, JsonParams}; -pub use text_data::TextData; -pub use text_handler::BoxedTextHandler; -pub use txt_handler::{TxtHandler, TxtSpan}; -pub use txt_loader::{TxtLoader, TxtParams}; +pub use self::html_loader::{HtmlLoader, HtmlParams}; +pub use self::json_handler::{JsonData, JsonHandler, JsonIndent, JsonPath}; +pub use self::json_loader::{JsonLoader, JsonParams}; +pub use 
self::text_data::TextData; +pub use self::text_handler::BoxedTextHandler; +pub use self::txt_handler::{TxtHandler, TxtSpan}; +pub use self::txt_loader::{TxtLoader, TxtParams}; #[cfg(feature = "xlsx")] -pub use xlsx_handler::XlsxHandler; +pub use self::xlsx_handler::XlsxHandler; #[cfg(feature = "xlsx")] -pub use xlsx_loader::{XlsxLoader, XlsxParams}; +pub use self::xlsx_loader::{XlsxLoader, XlsxParams}; /// Capability trait for handlers that expose text content. /// diff --git a/crates/nvisy-codec/src/lib.rs b/crates/nvisy-codec/src/lib.rs index 4bf99170..90f92dc8 100644 --- a/crates/nvisy-codec/src/lib.rs +++ b/crates/nvisy-codec/src/lib.rs @@ -6,7 +6,7 @@ mod document; pub mod handler; pub mod transform; -pub use document::{Document, Span, SpanStream}; +pub use self::document::{Document, Span, SpanStream}; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-codec/src/transform/audio/mod.rs b/crates/nvisy-codec/src/transform/audio/mod.rs index d45e2a81..3c7d3a8a 100644 --- a/crates/nvisy-codec/src/transform/audio/mod.rs +++ b/crates/nvisy-codec/src/transform/audio/mod.rs @@ -3,5 +3,5 @@ mod instruction; mod transform; -pub use instruction::{AudioOutput, AudioRedaction}; -pub use transform::AudioTransform; +pub use self::instruction::{AudioOutput, AudioRedaction}; +pub use self::transform::AudioTransform; diff --git a/crates/nvisy-codec/src/transform/image/mod.rs b/crates/nvisy-codec/src/transform/image/mod.rs index e40f4129..8ff18a98 100644 --- a/crates/nvisy-codec/src/transform/image/mod.rs +++ b/crates/nvisy-codec/src/transform/image/mod.rs @@ -4,5 +4,5 @@ mod instruction; mod ops; mod transform; -pub use instruction::{ImageOutput, ImageRedaction}; -pub use transform::ImageTransform; +pub use self::instruction::{ImageOutput, ImageRedaction}; +pub use self::transform::ImageTransform; diff --git a/crates/nvisy-codec/src/transform/mod.rs b/crates/nvisy-codec/src/transform/mod.rs index 98f67a6c..823d80ae 100644 --- a/crates/nvisy-codec/src/transform/mod.rs +++ 
b/crates/nvisy-codec/src/transform/mod.rs @@ -4,6 +4,6 @@ mod audio; mod image; mod text; -pub use audio::{AudioOutput, AudioRedaction, AudioTransform}; -pub use image::{ImageOutput, ImageRedaction, ImageTransform}; -pub use text::{TextOutput, TextRedaction, TextTransform}; +pub use self::audio::{AudioOutput, AudioRedaction, AudioTransform}; +pub use self::image::{ImageOutput, ImageRedaction, ImageTransform}; +pub use self::text::{TextOutput, TextRedaction, TextTransform}; diff --git a/crates/nvisy-codec/src/transform/text/mod.rs b/crates/nvisy-codec/src/transform/text/mod.rs index 4235678f..0993a6d9 100644 --- a/crates/nvisy-codec/src/transform/text/mod.rs +++ b/crates/nvisy-codec/src/transform/text/mod.rs @@ -3,5 +3,5 @@ mod instruction; mod transform; -pub use instruction::{TextOutput, TextRedaction}; -pub use transform::TextTransform; +pub use self::instruction::{TextOutput, TextRedaction}; +pub use self::transform::TextTransform; diff --git a/crates/nvisy-core/src/content/mod.rs b/crates/nvisy-core/src/content/mod.rs index 7783432e..26e4bb29 100644 --- a/crates/nvisy-core/src/content/mod.rs +++ b/crates/nvisy-core/src/content/mod.rs @@ -13,10 +13,10 @@ mod data_reference; mod encoding; mod source; -pub use bundle::Content; -pub use content_bytes::ContentBytes; -pub use content_data::ContentData; -pub use content_metadata::ContentMetadata; -pub use data_reference::DataReference; -pub use encoding::TextEncoding; -pub use source::ContentSource; +pub use self::bundle::Content; +pub use self::content_bytes::ContentBytes; +pub use self::content_data::ContentData; +pub use self::content_metadata::ContentMetadata; +pub use self::data_reference::DataReference; +pub use self::encoding::TextEncoding; +pub use self::source::ContentSource; diff --git a/crates/nvisy-core/src/error.rs b/crates/nvisy-core/src/error.rs index 063cd422..1a50448a 100644 --- a/crates/nvisy-core/src/error.rs +++ b/crates/nvisy-core/src/error.rs @@ -44,7 +44,7 @@ pub struct Error { /// 
Human-readable description of what went wrong. pub message: String, /// Name of the component that produced this error (e.g. `"s3-read"`, `"detect-regex"`). - pub source_component: Option, + pub component: Option, /// Whether the operation that failed can be safely retried. pub retryable: bool, /// The underlying cause, if any. @@ -58,7 +58,7 @@ impl Error { Self { kind, message: message.into(), - source_component: None, + component: None, retryable: false, source: None, } @@ -72,7 +72,7 @@ impl Error { /// Tag this error with the name of the component that produced it. pub fn with_component(mut self, component: impl Into) -> Self { - self.source_component = Some(component.into()); + self.component = Some(component.into()); self } diff --git a/crates/nvisy-core/src/lib.rs b/crates/nvisy-core/src/lib.rs index 8e0e6344..ef2c9136 100644 --- a/crates/nvisy-core/src/lib.rs +++ b/crates/nvisy-core/src/lib.rs @@ -7,7 +7,7 @@ pub mod math; pub mod media; mod error; -pub use error::{Error, ErrorKind, Result}; +pub use self::error::{Error, ErrorKind, Result}; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-core/src/math/mod.rs b/crates/nvisy-core/src/math/mod.rs index 0c677122..69fa2091 100644 --- a/crates/nvisy-core/src/math/mod.rs +++ b/crates/nvisy-core/src/math/mod.rs @@ -8,7 +8,7 @@ mod dpi; mod polygon; mod time_span; -pub use bounding_box::{BoundingBox, BoundingBoxPixel}; -pub use dpi::Dpi; -pub use polygon::{Polygon, Vertex}; -pub use time_span::TimeSpan; +pub use self::bounding_box::{BoundingBox, BoundingBoxPixel}; +pub use self::dpi::Dpi; +pub use self::polygon::{Polygon, Vertex}; +pub use self::time_span::TimeSpan; diff --git a/crates/nvisy-core/src/media/mod.rs b/crates/nvisy-core/src/media/mod.rs index 68ba8983..e6adb9f8 100644 --- a/crates/nvisy-core/src/media/mod.rs +++ b/crates/nvisy-core/src/media/mod.rs @@ -6,8 +6,8 @@ mod content_kind; mod document_type; -pub use content_kind::ContentKind; -pub use document_type::{ +pub use 
self::content_kind::ContentKind; +pub use self::document_type::{ AudioFormat, DocumentType, ImageFormat, PresentationFormat, SpreadsheetFormat, TextFormat, WordFormat, }; diff --git a/crates/nvisy-engine/src/compiler/graph/mod.rs b/crates/nvisy-engine/src/compiler/graph/mod.rs index d8d5310c..221de0b1 100644 --- a/crates/nvisy-engine/src/compiler/graph/mod.rs +++ b/crates/nvisy-engine/src/compiler/graph/mod.rs @@ -11,15 +11,15 @@ mod target; use std::collections::HashSet; -pub use action::{ActionKind, ActionNode}; use nvisy_core::Error; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use source::SourceNode; -pub use target::TargetNode; use uuid::Uuid; use validator::Validate; +pub use self::action::{ActionKind, ActionNode}; +pub use self::source::SourceNode; +pub use self::target::TargetNode; use super::policy::{RetryPolicy, TimeoutPolicy}; /// A node in the pipeline graph. diff --git a/crates/nvisy-engine/src/compiler/mod.rs b/crates/nvisy-engine/src/compiler/mod.rs index 9ce2b6ae..6ac7baa2 100644 --- a/crates/nvisy-engine/src/compiler/mod.rs +++ b/crates/nvisy-engine/src/compiler/mod.rs @@ -10,16 +10,17 @@ mod policy; use std::collections::HashMap; -pub use graph::{ - ActionKind, ActionNode, Graph, GraphEdge, GraphNode, GraphNodeKind, SourceNode, TargetNode, -}; use nvisy_core::Error; use petgraph::algo::{is_cyclic_directed, toposort}; use petgraph::graph::{DiGraph, NodeIndex}; -pub(crate) use plan::{ExecutionPlan, ResolvedNode}; -pub use policy::{BackoffStrategy, RetryPolicy, TimeoutBehavior, TimeoutPolicy}; use uuid::Uuid; +pub use self::graph::{ + ActionKind, ActionNode, Graph, GraphEdge, GraphNode, GraphNodeKind, SourceNode, TargetNode, +}; +pub(crate) use self::plan::{ExecutionPlan, ResolvedNode}; +pub use self::policy::{BackoffStrategy, RetryPolicy, TimeoutBehavior, TimeoutPolicy}; + /// Pipeline compiler with optional default policies. 
/// /// Nodes that don't carry their own retry or timeout policy will inherit diff --git a/crates/nvisy-engine/src/compiler/policy/mod.rs b/crates/nvisy-engine/src/compiler/policy/mod.rs index 13adfb43..769e82f5 100644 --- a/crates/nvisy-engine/src/compiler/policy/mod.rs +++ b/crates/nvisy-engine/src/compiler/policy/mod.rs @@ -3,5 +3,5 @@ mod retry; mod timeout; -pub use retry::{BackoffStrategy, RetryPolicy}; -pub use timeout::{TimeoutBehavior, TimeoutPolicy}; +pub use self::retry::{BackoffStrategy, RetryPolicy}; +pub use self::timeout::{TimeoutBehavior, TimeoutPolicy}; diff --git a/crates/nvisy-engine/src/lib.rs b/crates/nvisy-engine/src/lib.rs index cbe92933..08727136 100644 --- a/crates/nvisy-engine/src/lib.rs +++ b/crates/nvisy-engine/src/lib.rs @@ -8,11 +8,11 @@ pub mod pipeline; pub mod provenance; // Re-export graph data model for pipeline definitions. -pub use compiler::{ +pub use self::compiler::{ ActionKind, ActionNode, Graph, GraphEdge, GraphNode, GraphNodeKind, SourceNode, TargetNode, }; // Re-export retry and timeout policies for pipeline nodes. -pub use compiler::{BackoffStrategy, RetryPolicy, TimeoutBehavior, TimeoutPolicy}; -pub use pipeline::{ +pub use self::compiler::{BackoffStrategy, RetryPolicy, TimeoutBehavior, TimeoutPolicy}; +pub use self::pipeline::{ DefaultEngine, EngineSection, LlmSection, OcrSection, RuntimeConfig, SttSection, TtsSection, }; diff --git a/crates/nvisy-engine/src/operation/context/mod.rs b/crates/nvisy-engine/src/operation/context/mod.rs index ccbfc2fc..01cffa9f 100644 --- a/crates/nvisy-engine/src/operation/context/mod.rs +++ b/crates/nvisy-engine/src/operation/context/mod.rs @@ -19,15 +19,13 @@ //! [`Operation::Input`]: crate::operation::Operation::Input //! 
[`Operation::Output`]: crate::operation::Operation::Output -mod envelope; mod parallel; mod sequential; mod shared; -pub use envelope::DocumentEnvelope; -pub use parallel::ParallelContext; -pub use sequential::SequentialContext; -pub use shared::SharedContext; +pub use self::parallel::ParallelContext; +pub use self::sequential::SequentialContext; +pub use self::shared::SharedContext; pub(crate) mod private { pub trait Sealed {} diff --git a/crates/nvisy-engine/src/operation/envelope/apply.rs b/crates/nvisy-engine/src/operation/envelope/apply.rs new file mode 100644 index 00000000..3e05a6c5 --- /dev/null +++ b/crates/nvisy-engine/src/operation/envelope/apply.rs @@ -0,0 +1,43 @@ +//! The [`ApplyPatch`] trait and blanket implementations. + +use super::DocumentEnvelope; + +/// A value that can be applied to a [`DocumentEnvelope`], merging +/// operation results into the shared pipeline state. +/// +/// Each operation returns a concrete patch type; the orchestrator +/// calls [`apply`](ApplyPatch::apply) to fold it into the envelope +/// without needing to know the operation's internals. +pub trait ApplyPatch { + /// Merge this patch into the envelope. + fn apply(self, envelope: &mut DocumentEnvelope); +} + +/// A no-op patch for operations that don't modify the envelope. +impl ApplyPatch for () { + fn apply(self, _envelope: &mut DocumentEnvelope) {} +} + +/// Apply multiple patches of the same type in sequence. +impl ApplyPatch for Vec

{ + fn apply(self, envelope: &mut DocumentEnvelope) { + for patch in self { + patch.apply(envelope); + } + } +} + +impl ApplyPatch for (A, B) { + fn apply(self, envelope: &mut DocumentEnvelope) { + self.0.apply(envelope); + self.1.apply(envelope); + } +} + +impl ApplyPatch for (A, B, C) { + fn apply(self, envelope: &mut DocumentEnvelope) { + self.0.apply(envelope); + self.1.apply(envelope); + self.2.apply(envelope); + } +} diff --git a/crates/nvisy-engine/src/operation/envelope/audit.rs b/crates/nvisy-engine/src/operation/envelope/audit.rs new file mode 100644 index 00000000..2b5110ba --- /dev/null +++ b/crates/nvisy-engine/src/operation/envelope/audit.rs @@ -0,0 +1,20 @@ +//! Audit entry patches. + +use super::DocumentEnvelope; +use super::apply::ApplyPatch; +use crate::provenance::AuditEntry; + +/// A single audit log entry recording what an operation did. +pub struct OperationEntry(pub AuditEntry); + +impl ApplyPatch for OperationEntry { + fn apply(self, envelope: &mut DocumentEnvelope) { + envelope.audit.push_entry(self.0); + } +} + +impl ApplyPatch for AuditEntry { + fn apply(self, envelope: &mut DocumentEnvelope) { + envelope.audit.push_entry(self); + } +} diff --git a/crates/nvisy-engine/src/operation/envelope/detection.rs b/crates/nvisy-engine/src/operation/envelope/detection.rs new file mode 100644 index 00000000..144969f5 --- /dev/null +++ b/crates/nvisy-engine/src/operation/envelope/detection.rs @@ -0,0 +1,30 @@ +//! Entity detection patches. + +use nvisy_ontology::entity::Entities; + +use super::DocumentEnvelope; +use super::apply::ApplyPatch; + +/// New entities discovered by a detection operation (NER, OCR, CV, +/// pattern match, manual annotation). +/// +/// Appended to the envelope's existing entity set. 
+pub struct DetectedEntities(pub Entities); + +impl ApplyPatch for DetectedEntities { + fn apply(self, envelope: &mut DocumentEnvelope) { + envelope.entities.extend(self.0); + } +} + +/// A fully recomputed entity set produced by refinement operations +/// (deduplication, ensemble fusion). +/// +/// Replaces the envelope's entity set entirely. +pub struct RefinedEntities(pub Entities); + +impl ApplyPatch for RefinedEntities { + fn apply(self, envelope: &mut DocumentEnvelope) { + envelope.entities = self.0; + } +} diff --git a/crates/nvisy-engine/src/operation/context/envelope.rs b/crates/nvisy-engine/src/operation/envelope/mod.rs similarity index 80% rename from crates/nvisy-engine/src/operation/context/envelope.rs rename to crates/nvisy-engine/src/operation/envelope/mod.rs index 0624a163..ff211db9 100644 --- a/crates/nvisy-engine/src/operation/context/envelope.rs +++ b/crates/nvisy-engine/src/operation/envelope/mod.rs @@ -9,19 +9,31 @@ //! ContentData //! ↓ Import //! DocumentEnvelope { document, … } -//! ↓ OCR / NER / CV / PatternMatch +//! ↓ OCR / NER / CV / PatternMatch → DetectedEntities //! DocumentEnvelope { document, entities, … } -//! ↓ Deduplication / Ensemble +//! ↓ Deduplication / Ensemble → RefinedEntities //! DocumentEnvelope { document, entities (merged), … } -//! ↓ PolicyEvaluation +//! ↓ PolicyEvaluation → PolicyOutcome //! DocumentEnvelope { document, entities, audit { decisions, records }, … } //! ↓ Redaction //! DocumentEnvelope { document (redacted), entities, audit { … } } //! ``` +//! +//! Operations produce typed patch values that implement [`ApplyPatch`]. +//! The orchestrator merges each patch via [`DocumentEnvelope::apply`]. 
+ +mod apply; +mod audit; +mod detection; +mod policy; use nvisy_codec::Document; use nvisy_ontology::entity::Entities; +pub use self::apply::ApplyPatch; +pub use self::audit::OperationEntry; +pub use self::detection::{DetectedEntities, RefinedEntities}; +pub use self::policy::PolicyOutcome; use crate::provenance::Audit; /// Per-document state that flows through the entire pipeline. @@ -69,6 +81,11 @@ impl DocumentEnvelope { pub fn entity_count(&self) -> usize { self.entities.len() } + + /// Merge an operation's output into this envelope. + pub fn apply(&mut self, patch: impl ApplyPatch) { + patch.apply(self); + } } impl std::fmt::Debug for DocumentEnvelope { diff --git a/crates/nvisy-engine/src/operation/envelope/policy.rs b/crates/nvisy-engine/src/operation/envelope/policy.rs new file mode 100644 index 00000000..078dac4e --- /dev/null +++ b/crates/nvisy-engine/src/operation/envelope/policy.rs @@ -0,0 +1,20 @@ +//! Policy evaluation patches. + +use super::DocumentEnvelope; +use super::apply::ApplyPatch; +use crate::provenance::{RedactionDecision, RedactionRecord}; + +/// Redaction decisions and audit records produced by policy evaluation. +pub struct PolicyOutcome { + /// How each entity should be redacted. + pub decisions: Vec, + /// Audit-facing records of what was decided. 
+ pub records: Vec, +} + +impl ApplyPatch for PolicyOutcome { + fn apply(self, envelope: &mut DocumentEnvelope) { + envelope.audit.decisions.extend(self.decisions); + envelope.audit.records.extend(self.records); + } +} diff --git a/crates/nvisy-engine/src/operation/inference/computer_vision.rs b/crates/nvisy-engine/src/operation/inference/computer_vision.rs index 1650fb9c..5ad449ba 100644 --- a/crates/nvisy-engine/src/operation/inference/computer_vision.rs +++ b/crates/nvisy-engine/src/operation/inference/computer_vision.rs @@ -7,9 +7,10 @@ use nvisy_codec::Span; use nvisy_codec::handler::ImageData; use nvisy_core::math::BoundingBox; use nvisy_core::{Error, Result}; -use nvisy_ontology::entity::{DetectionMethod, Entities, Entity, ImageLocation}; +use nvisy_ontology::entity::{Entity, ExtractionMethod, ImageLocation, RecognitionMethod}; use nvisy_rig::agent::{CvAgent, CvEntity, DetectionConfig}; +use crate::operation::envelope::DetectedEntities; use crate::operation::{Operation, ParallelContext}; const TARGET: &str = "nvisy_engine::op::computer_vision"; @@ -26,7 +27,7 @@ impl ComputerVision { Self { agent, config } } - async fn detect(&self, spans: Vec>) -> Result { + async fn detect(&self, spans: Vec>) -> Result { tracing::debug!(target: TARGET, span_count = spans.len(), "detecting entities"); let mut entities = Vec::new(); @@ -45,13 +46,13 @@ impl ComputerVision { } } - Ok(entities.into()) + Ok(DetectedEntities(entities.into())) } } impl Operation for ComputerVision { type Input = ParallelContext>>; - type Output = ParallelContext; + type Output = ParallelContext; async fn call(&self, input: Self::Input) -> Result { input.parallel_map(|spans| self.detect(spans)).await @@ -60,14 +61,15 @@ impl Operation for ComputerVision { /// Convert a [`CvEntity`] to an [`Entity`] with [`ImageLocation`]. 
fn map_cv_entity(cv: &CvEntity) -> Entity { - Entity::new( - cv.category.clone(), + let mut entity = Entity::new( + cv.category, cv.entity_type, &cv.label, - DetectionMethod::ObjectDetection, + RecognitionMethod::Classification, cv.confidence, - ) - .with_location( + ); + entity.extraction_methods = vec![ExtractionMethod::ObjectDetection]; + entity.with_location( ImageLocation { bounding_box: BoundingBox { x: cv.bbox[0], diff --git a/crates/nvisy-engine/src/operation/inference/mod.rs b/crates/nvisy-engine/src/operation/inference/mod.rs index 6891511e..5a99c2cb 100644 --- a/crates/nvisy-engine/src/operation/inference/mod.rs +++ b/crates/nvisy-engine/src/operation/inference/mod.rs @@ -27,11 +27,11 @@ mod summarization; mod transcription; mod translation; -pub use classification::Classification; -pub use computer_vision::ComputerVision; -pub use ner::{Ner, NerMethodParams}; -pub use ocr::Ocr; -pub use ocr_verification::{OcrVerification, OcrVerificationInput}; -pub use summarization::Summarization; -pub use transcription::Transcription; -pub use translation::Translation; +pub use self::classification::Classification; +pub use self::computer_vision::ComputerVision; +pub use self::ner::{Ner, NerMethodParams}; +pub use self::ocr::Ocr; +pub use self::ocr_verification::{OcrVerification, OcrVerificationInput}; +pub use self::summarization::Summarization; +pub use self::transcription::Transcription; +pub use self::translation::Translation; diff --git a/crates/nvisy-engine/src/operation/inference/ner.rs b/crates/nvisy-engine/src/operation/inference/ner.rs index 5db48803..0e350271 100644 --- a/crates/nvisy-engine/src/operation/inference/ner.rs +++ b/crates/nvisy-engine/src/operation/inference/ner.rs @@ -8,15 +8,14 @@ use nvisy_codec::Span; use nvisy_codec::handler::TxtSpan; use nvisy_core::{Error, Result}; use nvisy_http::HttpClient; -use nvisy_ontology::entity::{ - DetectionMethod, Entities, Entity, EntityCategory, EntityKind, TextLocation, -}; +use 
nvisy_ontology::entity::{Entity, EntityCategory, EntityKind, RecognitionMethod, TextLocation}; use nvisy_rig::agent::{ AgentConfig, AgentProvider, DetectionConfig, KnownNerEntity, NerAgent, NerContext, }; use serde::Deserialize; use tokio::sync::Mutex; +use crate::operation::envelope::DetectedEntities; use crate::operation::{Operation, SequentialContext}; const TARGET: &str = "nvisy_engine::op::ner"; @@ -100,7 +99,7 @@ impl Ner { state.known_entities.clear(); } - async fn detect(&self, spans: Vec>) -> Result { + async fn detect(&self, spans: Vec>) -> Result { tracing::debug!(target: TARGET, span_count = spans.len(), "running NER"); let mut entities = Vec::new(); @@ -119,7 +118,7 @@ impl Ner { for ner_entity in &ner_entities { let category: EntityCategory = match ner_entity.category { - Some(ref c) => c.clone(), + Some(c) => c, None => continue, }; let entity_kind = match ner_entity.entity_type { @@ -135,7 +134,7 @@ impl Ner { category, entity_kind, &ner_entity.value, - DetectionMethod::Ner, + RecognitionMethod::Ner, confidence, ); @@ -169,13 +168,13 @@ impl Ner { state.known_entities = merge_ctx.known_entities; } - Ok(entities.into()) + Ok(DetectedEntities(entities.into())) } } impl Operation for Ner { type Input = SequentialContext>>; - type Output = SequentialContext; + type Output = SequentialContext; async fn call(&self, input: Self::Input) -> Result { input.sequential_map(|spans| self.detect(spans)).await diff --git a/crates/nvisy-engine/src/operation/inference/ocr_verification.rs b/crates/nvisy-engine/src/operation/inference/ocr_verification.rs index d71df01a..d5029ccf 100644 --- a/crates/nvisy-engine/src/operation/inference/ocr_verification.rs +++ b/crates/nvisy-engine/src/operation/inference/ocr_verification.rs @@ -12,6 +12,7 @@ use nvisy_core::{Error, Result}; use nvisy_ontology::entity::Entities; use nvisy_rig::agent::OcrAgent; +use crate::operation::envelope::DetectedEntities; use crate::operation::{Operation, ParallelContext}; const TARGET: &str = 
"nvisy_engine::op::ocr_verification"; @@ -38,16 +39,16 @@ impl OcrVerification { Self { agent } } - async fn verify(&self, data: OcrVerificationInput) -> Result { + async fn verify(&self, data: OcrVerificationInput) -> Result { if data.entities.is_empty() { tracing::debug!(target: TARGET, "no entities to verify"); - return Ok(Entities::new()); + return Ok(DetectedEntities(Entities::new())); } tracing::debug!(target: TARGET, entity_count = data.entities.len(), "verifying entities"); let image_bytes = match data.image_spans.first() { Some(span) => span.data.encode_png()?, - None => return Ok(data.entities), + None => return Ok(DetectedEntities(data.entities)), }; let entities = self @@ -56,13 +57,13 @@ impl OcrVerification { .await .map_err(|e| Error::runtime(e.to_string(), "ocr-verification", e.is_retryable()))?; - Ok(entities.into()) + Ok(DetectedEntities(entities.into())) } } impl Operation for OcrVerification { type Input = ParallelContext; - type Output = ParallelContext; + type Output = ParallelContext; async fn call(&self, input: Self::Input) -> Result { input.parallel_map(|data| self.verify(data)).await diff --git a/crates/nvisy-engine/src/operation/lifecycle/encryption.rs b/crates/nvisy-engine/src/operation/lifecycle/encryption.rs index 4fcb843b..35cd0102 100644 --- a/crates/nvisy-engine/src/operation/lifecycle/encryption.rs +++ b/crates/nvisy-engine/src/operation/lifecycle/encryption.rs @@ -7,7 +7,7 @@ use aes_gcm::{Aes256Gcm, KeyInit, Nonce}; use nvisy_core::{Error, ErrorKind, Result}; use rand::RngExt; -use crate::operation::context::DocumentEnvelope; +use crate::operation::envelope::DocumentEnvelope; use crate::operation::utility::crypto::{ EncryptedContent, EncryptionAlgorithm, KeyProvider, NONCE_SIZE, WireEnvelope, }; diff --git a/crates/nvisy-engine/src/operation/lifecycle/mod.rs b/crates/nvisy-engine/src/operation/lifecycle/mod.rs index d6b0bb4d..65d17a00 100644 --- a/crates/nvisy-engine/src/operation/lifecycle/mod.rs +++ 
b/crates/nvisy-engine/src/operation/lifecycle/mod.rs @@ -24,10 +24,10 @@ mod encryption; mod export; mod import; -pub use compression::Compression; -pub use conversion::Conversion; -pub use decompression::Decompression; -pub use decryption::Decryption; -pub use encryption::Encryption; -pub use export::Export; -pub use import::Import; +pub use self::compression::Compression; +pub use self::conversion::Conversion; +pub use self::decompression::Decompression; +pub use self::decryption::Decryption; +pub use self::encryption::Encryption; +pub use self::export::Export; +pub use self::import::Import; diff --git a/crates/nvisy-engine/src/operation/mod.rs b/crates/nvisy-engine/src/operation/mod.rs index 1e8e20a2..14473fd5 100644 --- a/crates/nvisy-engine/src/operation/mod.rs +++ b/crates/nvisy-engine/src/operation/mod.rs @@ -15,6 +15,7 @@ //! | Lifecycle | [`lifecycle`] | Content I/O (import, export, encrypt) | mod context; +pub mod envelope; pub mod inference; pub mod lifecycle; pub mod processing; @@ -22,11 +23,11 @@ pub mod utility; use std::future::Future; -pub use context::{ - DocumentEnvelope, OperationContext, ParallelContext, SequentialContext, SharedContext, -}; use nvisy_core::Result; +pub use self::context::{OperationContext, ParallelContext, SequentialContext, SharedContext}; +pub use self::envelope::DocumentEnvelope; + /// A single unit of work in the redaction pipeline. /// /// Operations are stateless and composable. The engine calls [`Operation::call`] diff --git a/crates/nvisy-engine/src/operation/processing/deduplication.rs b/crates/nvisy-engine/src/operation/processing/deduplication.rs index 6a8a37d1..23c5b18b 100644 --- a/crates/nvisy-engine/src/operation/processing/deduplication.rs +++ b/crates/nvisy-engine/src/operation/processing/deduplication.rs @@ -2,11 +2,12 @@ //! //! Merges entities that share the same `entity_kind`, `value`, and //! overlapping location into a single entity with the highest -//! 
confidence and `DetectionMethod::Composite` when methods differ. +//! confidence and combined recognition methods. use nvisy_core::Result; -use nvisy_ontology::entity::{DetectionMethod, Entities, Entity, Location}; +use nvisy_ontology::entity::{Entities, Entity, Location, RefinementMethod}; +use crate::operation::envelope::RefinedEntities; use crate::operation::{Operation, ParallelContext}; const TARGET: &str = "nvisy_engine::op::deduplication"; @@ -18,16 +19,16 @@ const TARGET: &str = "nvisy_engine::op::deduplication"; /// /// When merging: /// - The highest confidence score is kept. -/// - If the detection methods differ, the merged entity uses -/// `DetectionMethod::Composite`. +/// - Recognition methods are combined into an ordered vector. +/// - [`RefinementMethod::Deduplication`] is recorded on the merged entity. pub struct Deduplication; impl Deduplication { - async fn deduplicate(&self, entities: Entities) -> Result { + async fn deduplicate(&self, entities: Entities) -> Result { let before = entities.len(); let result = Self::execute(entities); tracing::debug!(target: TARGET, before, after = result.len(), "deduplicated entities"); - Ok(result) + Ok(RefinedEntities(result)) } /// Deduplicate and merge overlapping entities. 
@@ -50,8 +51,18 @@ impl Deduplication { if entity.confidence > existing.confidence { existing.confidence = entity.confidence; } - if existing.detection_method != entity.detection_method { - existing.detection_method = DetectionMethod::Composite; + for m in entity.recognition_methods { + if !existing.recognition_methods.contains(&m) { + existing.recognition_methods.push(m); + } + } + if !existing + .refinement_methods + .contains(&RefinementMethod::Deduplication) + { + existing + .refinement_methods + .push(RefinementMethod::Deduplication); } } None => { @@ -66,7 +77,7 @@ impl Deduplication { impl Operation for Deduplication { type Input = ParallelContext; - type Output = ParallelContext; + type Output = ParallelContext; async fn call(&self, input: Self::Input) -> Result { input.parallel_map(|data| self.deduplicate(data)).await @@ -87,19 +98,19 @@ fn locations_overlap(a: &Option, b: &Option) -> bool { #[cfg(test)] mod tests { - use nvisy_ontology::entity::{EntityCategory, EntityKind, TextLocation}; + use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod, TextLocation}; use super::*; fn text_entity( value: &str, - method: DetectionMethod, + method: RecognitionMethod, confidence: f64, start: usize, end: usize, ) -> Entity { Entity::new( - EntityCategory::Pii, + EntityCategory::PersonalIdentity, EntityKind::PersonName, value, method, @@ -118,34 +129,44 @@ mod tests { #[test] fn duplicates_merged_same_method() { let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.8, 0, 4), - text_entity("John", DetectionMethod::Regex, 0.9, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.8, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.9, 0, 4), ] .into(); let result = Deduplication::execute(entities); assert_eq!(result.len(), 1); assert!((result[0].confidence - 0.9).abs() < f64::EPSILON); - assert_eq!(result[0].detection_method, DetectionMethod::Regex); + assert_eq!( + result[0].recognition_methods, + 
vec![RecognitionMethod::Regex] + ); + assert_eq!( + result[0].refinement_methods, + vec![RefinementMethod::Deduplication] + ); } #[test] - fn different_methods_become_composite() { + fn different_methods_are_combined() { let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.8, 0, 4), - text_entity("John", DetectionMethod::Ner, 0.85, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.8, 0, 4), + text_entity("John", RecognitionMethod::Ner, 0.85, 0, 4), ] .into(); let result = Deduplication::execute(entities); assert_eq!(result.len(), 1); - assert_eq!(result[0].detection_method, DetectionMethod::Composite); + assert_eq!( + result[0].recognition_methods, + vec![RecognitionMethod::Regex, RecognitionMethod::Ner] + ); assert!((result[0].confidence - 0.85).abs() < f64::EPSILON); } #[test] fn non_overlapping_preserved() { let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.8, 0, 4), - text_entity("John", DetectionMethod::Regex, 0.9, 10, 14), + text_entity("John", RecognitionMethod::Regex, 0.8, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.9, 10, 14), ] .into(); let result = Deduplication::execute(entities); @@ -155,8 +176,8 @@ mod tests { #[test] fn different_values_not_merged() { let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.8, 0, 4), - text_entity("Jane", DetectionMethod::Regex, 0.9, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.8, 0, 4), + text_entity("Jane", RecognitionMethod::Regex, 0.9, 0, 4), ] .into(); let result = Deduplication::execute(entities); @@ -172,7 +193,7 @@ mod tests { #[test] fn single_entity_unchanged() { let entities: Entities = - vec![text_entity("John", DetectionMethod::Regex, 0.8, 0, 4)].into(); + vec![text_entity("John", RecognitionMethod::Regex, 0.8, 0, 4)].into(); let result = Deduplication::execute(entities); assert_eq!(result.len(), 1); } @@ -181,12 +202,15 @@ mod tests { fn overlapping_ranges_merge() { // Partially overlapping: 0..6 
and 3..9. let entities: Entities = vec![ - text_entity("John Doe", DetectionMethod::Regex, 0.7, 0, 6), - text_entity("John Doe", DetectionMethod::Ner, 0.9, 3, 9), + text_entity("John Doe", RecognitionMethod::Regex, 0.7, 0, 6), + text_entity("John Doe", RecognitionMethod::Ner, 0.9, 3, 9), ] .into(); let result = Deduplication::execute(entities); assert_eq!(result.len(), 1); - assert_eq!(result[0].detection_method, DetectionMethod::Composite); + assert_eq!( + result[0].recognition_methods, + vec![RecognitionMethod::Regex, RecognitionMethod::Ner] + ); } } diff --git a/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs b/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs index 75155a55..4dbbaa04 100644 --- a/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs +++ b/crates/nvisy-engine/src/operation/processing/ensemble_fusion.rs @@ -4,8 +4,9 @@ use std::collections::HashMap; use nvisy_core::Result; -use nvisy_ontology::entity::{DetectionMethod, Entities, Entity, Location}; +use nvisy_ontology::entity::{Entities, Entity, Location, RecognitionMethod, RefinementMethod}; +use crate::operation::envelope::RefinedEntities; use crate::operation::{Operation, ParallelContext}; const TARGET: &str = "nvisy_engine::op::ensemble"; @@ -15,9 +16,9 @@ const TARGET: &str = "nvisy_engine::op::ensemble"; pub enum FusionStrategy { /// Take the maximum confidence across all detectors. MaxConfidence, - /// Weighted average by detection method. + /// Weighted average by recognition method. WeightedAverage { - weights: HashMap, + weights: HashMap, }, /// Noisy-OR: `P = 1 − ∏(1 − pᵢ)` for independent detectors. 
NoisyOr, @@ -35,11 +36,11 @@ impl Ensemble { Self { strategy } } - async fn fuse(&self, entities: Entities) -> Result { + async fn fuse(&self, entities: Entities) -> Result { let before = entities.len(); let result = self.merge(entities); tracing::debug!(target: TARGET, before, after = result.len(), "fused entities"); - Ok(result) + Ok(RefinedEntities(result)) } /// Group entities by `(kind, value, overlapping location)` then fuse @@ -87,7 +88,9 @@ impl Ensemble { let mut total_weight = 0.0; let mut weighted_sum = 0.0; for e in &group { - let w = weights.get(&e.detection_method).copied().unwrap_or(1.0); + // Use the first recognition method for weight lookup. + let primary = e.recognition_methods.first(); + let w = primary.and_then(|m| weights.get(m)).copied().unwrap_or(1.0); weighted_sum += e.confidence * w; total_weight += w; } @@ -104,17 +107,30 @@ impl Ensemble { } }; - // Use the first entity as the base and update confidence/method. + // Collect all recognition methods from the group in order. + let mut merged_methods = Vec::new(); + for e in &group { + for m in &e.recognition_methods { + if !merged_methods.contains(m) { + merged_methods.push(*m); + } + } + } + + // Use the first entity as the base and update confidence/methods. 
let mut result = group.into_iter().next().unwrap(); result.confidence = fused_confidence; - result.detection_method = DetectionMethod::Composite; + result.recognition_methods = merged_methods; + result + .refinement_methods + .push(RefinementMethod::EnsembleFusion); result } } impl Operation for Ensemble { type Input = ParallelContext; - type Output = ParallelContext; + type Output = ParallelContext; async fn call(&self, input: Self::Input) -> Result { input.parallel_map(|data| self.fuse(data)).await @@ -138,13 +154,13 @@ mod tests { fn text_entity( value: &str, - method: DetectionMethod, + method: RecognitionMethod, confidence: f64, start: usize, end: usize, ) -> Entity { Entity::new( - EntityCategory::Pii, + EntityCategory::PersonalIdentity, EntityKind::PersonName, value, method, @@ -164,22 +180,29 @@ mod tests { fn max_confidence_strategy() { let merge = Ensemble::new(FusionStrategy::MaxConfidence); let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.7, 0, 4), - text_entity("John", DetectionMethod::Ner, 0.85, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.7, 0, 4), + text_entity("John", RecognitionMethod::Ner, 0.85, 0, 4), ] .into(); let result = merge.merge(entities); assert_eq!(result.len(), 1); assert!((result[0].confidence - 0.85).abs() < f64::EPSILON); - assert_eq!(result[0].detection_method, DetectionMethod::Composite); + assert_eq!( + result[0].recognition_methods, + vec![RecognitionMethod::Regex, RecognitionMethod::Ner] + ); + assert_eq!( + result[0].refinement_methods, + vec![RefinementMethod::EnsembleFusion] + ); } #[test] fn noisy_or_strategy() { let merge = Ensemble::new(FusionStrategy::NoisyOr); let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.7, 0, 4), - text_entity("John", DetectionMethod::Ner, 0.8, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.7, 0, 4), + text_entity("John", RecognitionMethod::Ner, 0.8, 0, 4), ] .into(); let result = merge.merge(entities); @@ -191,13 
+214,13 @@ mod tests { #[test] fn weighted_average_strategy() { let mut weights = HashMap::new(); - weights.insert(DetectionMethod::Regex, 1.0); - weights.insert(DetectionMethod::Ner, 2.0); + weights.insert(RecognitionMethod::Regex, 1.0); + weights.insert(RecognitionMethod::Ner, 2.0); let merge = Ensemble::new(FusionStrategy::WeightedAverage { weights }); let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.6, 0, 4), - text_entity("John", DetectionMethod::Ner, 0.9, 0, 4), + text_entity("John", RecognitionMethod::Regex, 0.6, 0, 4), + text_entity("John", RecognitionMethod::Ner, 0.9, 0, 4), ] .into(); let result = merge.merge(entities); @@ -210,8 +233,8 @@ mod tests { fn non_overlapping_not_merged() { let merge = Ensemble::new(FusionStrategy::NoisyOr); let entities: Entities = vec![ - text_entity("John", DetectionMethod::Regex, 0.7, 0, 4), - text_entity("John", DetectionMethod::Ner, 0.8, 10, 14), + text_entity("John", RecognitionMethod::Regex, 0.7, 0, 4), + text_entity("John", RecognitionMethod::Ner, 0.8, 10, 14), ] .into(); let result = merge.merge(entities); @@ -222,11 +245,14 @@ mod tests { fn single_entity_unchanged() { let merge = Ensemble::new(FusionStrategy::NoisyOr); let entities: Entities = - vec![text_entity("John", DetectionMethod::Regex, 0.7, 0, 4)].into(); + vec![text_entity("John", RecognitionMethod::Regex, 0.7, 0, 4)].into(); let result = merge.merge(entities); assert_eq!(result.len(), 1); assert!((result[0].confidence - 0.7).abs() < f64::EPSILON); - assert_eq!(result[0].detection_method, DetectionMethod::Regex); + assert_eq!( + result[0].recognition_methods, + vec![RecognitionMethod::Regex] + ); } #[test] diff --git a/crates/nvisy-engine/src/operation/processing/manual_detection.rs b/crates/nvisy-engine/src/operation/processing/manual_detection.rs index f5cd427e..59ce3b70 100644 --- a/crates/nvisy-engine/src/operation/processing/manual_detection.rs +++ b/crates/nvisy-engine/src/operation/processing/manual_detection.rs @@ -5,7 
+5,7 @@ use nvisy_core::Result; use nvisy_ontology::entity::{ - Annotation, AnnotationKind, DetectionMethod, Entities, Entity, Location, + Annotation, AnnotationKind, Entities, Entity, Location, RecognitionMethod, }; use serde::Deserialize; @@ -37,7 +37,7 @@ pub struct ManualOutput { } /// Converts each inclusion [`Annotation`] into a full [`Entity`] with -/// `DetectionMethod::Manual` and confidence 1.0. Collects exclusion +/// `RecognitionMethod::Manual` and confidence 1.0. Collects exclusion /// annotations for downstream filtering. pub struct ManualDetection; @@ -54,8 +54,8 @@ impl ManualDetection { for ann in &annotations { match ann.kind { AnnotationKind::Inclusion => { - let category = match &ann.category { - Some(c) => c.clone(), + let category = match ann.category { + Some(c) => c, None => continue, }; let entity_kind = match ann.entity_kind { @@ -65,7 +65,7 @@ impl ManualDetection { let value = ann.value.clone().unwrap_or_default(); let mut entity = - Entity::new(category, entity_kind, value, DetectionMethod::Manual, 1.0); + Entity::new(category, entity_kind, value, RecognitionMethod::Manual, 1.0); entity.location = ann.location.clone(); entities.push(entity); } @@ -128,10 +128,10 @@ mod tests { fn make_entity(value: &str, start: usize, end: usize) -> Entity { Entity::new( - EntityCategory::Pii, + EntityCategory::PersonalIdentity, EntityKind::PersonName, value, - DetectionMethod::Manual, + RecognitionMethod::Manual, 1.0, ) .with_location( @@ -199,7 +199,7 @@ mod tests { let annotations = vec![ Annotation { kind: AnnotationKind::Inclusion, - category: Some(EntityCategory::Pii), + category: Some(EntityCategory::PersonalIdentity), entity_kind: Some(EntityKind::PersonName), value: Some("Alice".into()), location: None, diff --git a/crates/nvisy-engine/src/operation/processing/mod.rs b/crates/nvisy-engine/src/operation/processing/mod.rs index 32eb85b9..a3e9564b 100644 --- a/crates/nvisy-engine/src/operation/processing/mod.rs +++ 
b/crates/nvisy-engine/src/operation/processing/mod.rs @@ -22,12 +22,12 @@ mod policy_evaluation; mod redaction; mod validation; -pub use deduplication::Deduplication; -pub use ensemble_fusion::{Ensemble, FusionStrategy}; -pub use manual_detection::{ +pub use self::deduplication::Deduplication; +pub use self::ensemble_fusion::{Ensemble, FusionStrategy}; +pub use self::manual_detection::{ Exclusion, ManualDetection, ManualDetectionParams, ManualOutput, is_excluded, }; -pub use pattern_match::{PatternDetectionParams, PatternInput, PatternMatch}; -pub use policy_evaluation::{EvaluatePolicy, EvaluatePolicyParams}; -pub use redaction::{Redaction, RedactionInput, RedactionOutput}; -pub use validation::Validation; +pub use self::pattern_match::{PatternDetectionParams, PatternMatch}; +pub use self::policy_evaluation::{EvaluatePolicy, EvaluatePolicyParams}; +pub use self::redaction::{Redaction, RedactionInput, RedactionOutput}; +pub use self::validation::Validation; diff --git a/crates/nvisy-engine/src/operation/processing/pattern_match.rs b/crates/nvisy-engine/src/operation/processing/pattern_match.rs index 3174432d..d41d745a 100644 --- a/crates/nvisy-engine/src/operation/processing/pattern_match.rs +++ b/crates/nvisy-engine/src/operation/processing/pattern_match.rs @@ -1,19 +1,17 @@ //! Pattern-based PII/PHI entity detection operation. //! -//! Operates on text, CSV, HTML, and JSON spans, running both compiled +//! Scans type-erased text spans (`Span`) using compiled //! regex patterns and dictionary automata via [`PatternEngine`]. 
use nvisy_codec::Span; -use nvisy_codec::handler::{CsvSpan, HtmlSpan, JsonPath, TxtSpan}; +use nvisy_codec::handler::TextData; use nvisy_core::{Error, Result}; -use nvisy_ontology::entity::{DetectionMethod, Entities, Entity, TabularLocation, TextLocation}; -use nvisy_pattern::{ - ContextRule, DetectionSource, PatternEngine, PatternEngineBuilder, - PatternMatch as PatternMatchResult, -}; +use nvisy_ontology::entity::TextLocation; +use nvisy_pattern::patterns::ContextRule; +use nvisy_pattern::{PatternEngine, PatternEngineBuilder, RawMatch, ScanContext}; use serde::Deserialize; -use serde_json::Value; +use crate::operation::envelope::DetectedEntities; use crate::operation::{Operation, ParallelContext}; const TARGET: &str = "nvisy_engine::op::pattern_match"; @@ -28,19 +26,13 @@ pub struct PatternDetectionParams { pub patterns: Option>, } -/// Multi-modality input for pattern matching. -pub enum PatternInput { - Text(Vec>), - Csv(Vec>), - - Html(Vec>), - Json(Vec>), -} - /// Pattern detection operation backed by [`PatternEngine`]. /// -/// Handles both regex and dictionary matches, replacing the former -/// separate `DictionaryDetection`. +/// Accepts type-erased text spans from any [`TextHandler`] (plain text, +/// CSV, HTML, JSON, etc.) and detects entities using regex and dictionary +/// patterns with co-occurrence boosting. 
+/// +/// [`TextHandler`]: nvisy_codec::handler::TextHandler pub struct PatternMatch { engine: PatternEngine, } @@ -58,232 +50,59 @@ impl PatternMatch { .map_err(|e| Error::validation(e.to_string(), "pattern-detection"))?; Ok(Self { engine }) } -} - -impl PatternMatch { - async fn scan(&self, data: PatternInput) -> Result { - tracing::debug!(target: TARGET, "scanning for patterns"); - match data { - PatternInput::Text(spans) => self.detect_text(spans), - PatternInput::Csv(spans) => self.detect_csv(spans), - PatternInput::Html(spans) => self.detect_html(spans), - PatternInput::Json(spans) => self.detect_json(spans), - } - } -} - -impl Operation for PatternMatch { - type Input = ParallelContext; - type Output = ParallelContext; - - async fn call(&self, input: Self::Input) -> Result { - input.parallel_map(|data| self.scan(data)).await - } -} - -impl PatternMatch { - fn detect_text(&self, spans: Vec>) -> Result { - // Phase 1: collect raw matches per span index. - let span_data: Vec<&str> = spans.iter().map(|s| s.data.as_str()).collect(); - let mut raw_matches: Vec<(usize, PatternMatchResult)> = Vec::new(); - - for (idx, span) in spans.iter().enumerate() { - for m in self.engine.scan_text(&span.data) { - raw_matches.push((idx, m)); - } - } - - // Phase 2: apply co-occurrence boost and build entities. 
- let mut entities = Vec::new(); - for (span_idx, m) in &raw_matches { - let confidence = if let Some(ref ctx) = m.context { - apply_cooccurrence(&span_data, *span_idx, ctx, m.confidence) - } else { - m.confidence - }; - - let method = detection_method(m.source); - - let entity = Entity::new( - m.category.clone(), - m.entity_kind, - &m.value, - method, - confidence, - ) - .with_location( - TextLocation { - start_offset: m.start, - end_offset: m.end, - element_id: Some(spans[*span_idx].id.0.to_string()), - ..Default::default() - } - .into(), - ) - .with_parent(&spans[*span_idx].source); - - entities.push(entity); - } - - Ok(entities.into()) - } - - fn detect_csv(&self, spans: Vec>) -> Result { - // Collect all span data (including headers) for co-occurrence window. - let span_data: Vec<&str> = spans.iter().map(|s| s.data.as_str()).collect(); - - // Phase 1: collect raw matches per span index (skip headers). - let mut raw_matches: Vec<(usize, PatternMatchResult)> = Vec::new(); - for (idx, span) in spans.iter().enumerate() { - if span.id.header || span.data.is_empty() { - continue; - } - for m in self.engine.scan_text(&span.data) { - raw_matches.push((idx, m)); - } - } - - // Phase 2: apply co-occurrence boost and build entities. 
- let mut entities = Vec::new(); - for (span_idx, m) in &raw_matches { - let confidence = if let Some(ref ctx) = m.context { - apply_cooccurrence(&span_data, *span_idx, ctx, m.confidence) - } else { - m.confidence - }; - - let method = detection_method(m.source); - let span = &spans[*span_idx]; - - let entity = Entity::new( - m.category.clone(), - m.entity_kind, - &m.value, - method, - confidence, - ) - .with_location( - TabularLocation { - row_index: span.id.row, - column_index: span.id.col, - start_offset: Some(m.start), - end_offset: Some(m.end), - column_name: None, - sheet_name: None, - } - .into(), - ) - .with_parent(&span.source); - - entities.push(entity); - } - Ok(entities.into()) - } + fn detect(&self, spans: Vec>) -> Result { + tracing::debug!(target: TARGET, span_count = spans.len(), "scanning for patterns"); - fn detect_html(&self, spans: Vec>) -> Result { let span_data: Vec<&str> = spans.iter().map(|s| s.data.as_str()).collect(); - let mut raw_matches: Vec<(usize, PatternMatchResult)> = Vec::new(); + let mut raw_matches: Vec<(usize, RawMatch)> = Vec::new(); + let scan_ctx = ScanContext::default(); for (idx, span) in spans.iter().enumerate() { - for m in self.engine.scan_text(&span.data) { + for m in self.engine.scan_text(span.data.as_str(), &scan_ctx) { raw_matches.push((idx, m)); } } let mut entities = Vec::new(); - for (span_idx, m) in &raw_matches { - let confidence = if let Some(ref ctx) = m.context { - apply_cooccurrence(&span_data, *span_idx, ctx, m.confidence) - } else { - m.confidence - }; - - let method = detection_method(m.source); - - let entity = Entity::new( - m.category.clone(), - m.entity_kind, - &m.value, - method, - confidence, - ) - .with_location( - TextLocation { - start_offset: m.start, - end_offset: m.end, - element_id: Some(spans[*span_idx].id.0.to_string()), - ..Default::default() - } - .into(), - ) - .with_parent(&spans[*span_idx].source); - - entities.push(entity); - } - - Ok(entities.into()) - } - - fn detect_json(&self, 
spans: Vec>) -> Result { - // Filter to string-valued spans and collect text for co-occurrence. - let string_spans: Vec<(usize, &str)> = spans - .iter() - .enumerate() - .filter_map(|(idx, s)| s.data.as_str().map(|text| (idx, text))) - .collect(); - - let span_data: Vec<&str> = string_spans.iter().map(|(_, text)| *text).collect(); - let mut raw_matches: Vec<(usize, PatternMatchResult)> = Vec::new(); - - for (co_idx, (_, text)) in string_spans.iter().enumerate() { - for m in self.engine.scan_text(text) { - raw_matches.push((co_idx, m)); - } - } - - let mut entities = Vec::new(); - for (co_idx, m) in &raw_matches { + for (span_idx, m) in raw_matches { let confidence = if let Some(ref ctx) = m.context { - apply_cooccurrence(&span_data, *co_idx, ctx, m.confidence) + apply_cooccurrence(&span_data, span_idx, ctx, m.confidence) } else { m.confidence }; - - let method = detection_method(m.source); - let (orig_idx, _) = string_spans[*co_idx]; - - let entity = Entity::new( - m.category.clone(), - m.entity_kind, - &m.value, - method, - confidence, - ) - .with_location( - TextLocation { - start_offset: m.start, - end_offset: m.end, - element_id: Some(spans[orig_idx].id.pointer.clone()), - ..Default::default() - } - .into(), - ) - .with_parent(&spans[orig_idx].source); + let start = m.start; + let end = m.end; + let element_id = spans[span_idx].id.to_string(); + let source = spans[span_idx].source; + + let mut entity = m.into_entity(); + entity.confidence = confidence; + let entity = entity + .with_location( + TextLocation { + start_offset: start, + end_offset: end, + element_id: Some(element_id), + ..Default::default() + } + .into(), + ) + .with_parent(&source); entities.push(entity); } - Ok(entities.into()) + Ok(DetectedEntities(entities.into())) } } -/// Map a [`DetectionSource`] to a [`DetectionMethod`]. 
-fn detection_method(source: DetectionSource) -> DetectionMethod { - match source { - DetectionSource::Regex => DetectionMethod::Regex, - DetectionSource::Dictionary => DetectionMethod::Dictionary, - DetectionSource::DenyList => DetectionMethod::Dictionary, +impl Operation for PatternMatch { + type Input = ParallelContext>>; + type Output = ParallelContext; + + async fn call(&self, input: Self::Input) -> Result { + input.parallel_map(|data| async { self.detect(data) }).await } } diff --git a/crates/nvisy-engine/src/operation/processing/policy_evaluation.rs b/crates/nvisy-engine/src/operation/processing/policy_evaluation.rs index 519ae369..9e93793e 100644 --- a/crates/nvisy-engine/src/operation/processing/policy_evaluation.rs +++ b/crates/nvisy-engine/src/operation/processing/policy_evaluation.rs @@ -13,6 +13,7 @@ use nvisy_ontology::entity::{Entities, Entity}; use nvisy_ontology::policy::{PolicyRule, RuleAction, Strategy, TextStrategy}; use serde::Deserialize; +use crate::operation::envelope::PolicyOutcome; use crate::operation::{Operation, ParallelContext}; use crate::provenance::{RedactionDecision, RedactionRecord}; @@ -40,14 +41,6 @@ fn default_threshold() -> f64 { 0.5 } -/// Output of policy evaluation: both pipeline decisions and audit records. -pub struct EvaluatePolicyOutput { - /// Pipeline-facing redaction decisions. - pub decisions: Vec, - /// Audit-facing redaction records. - pub records: Vec, -} - /// Evaluates policy rules against detected entities and produces /// [`RedactionDecision`] and [`RedactionRecord`] pairs. 
/// @@ -64,7 +57,7 @@ impl EvaluatePolicy { Ok(Self { params }) } - pub async fn execute(&self, entities: Entities) -> Result { + pub async fn execute(&self, entities: Entities) -> Result { tracing::debug!(target: TARGET, entity_count = entities.len(), "evaluating policies"); let default_spec = &self.params.default_spec; let default_threshold = self.params.default_confidence_threshold; @@ -116,13 +109,13 @@ impl EvaluatePolicy { records.push(record); } - Ok(EvaluatePolicyOutput { decisions, records }) + Ok(PolicyOutcome { decisions, records }) } } impl Operation for EvaluatePolicy { type Input = ParallelContext; - type Output = ParallelContext; + type Output = ParallelContext; async fn call(&self, input: Self::Input) -> Result { input.parallel_map(|data| self.execute(data)).await diff --git a/crates/nvisy-engine/src/pipeline/mod.rs b/crates/nvisy-engine/src/pipeline/mod.rs index dd15995b..350a0986 100644 --- a/crates/nvisy-engine/src/pipeline/mod.rs +++ b/crates/nvisy-engine/src/pipeline/mod.rs @@ -16,17 +16,19 @@ mod runs; use std::future::Future; -pub use config::{EngineSection, LlmSection, OcrSection, RuntimeConfig, SttSection, TtsSection}; -pub use default::DefaultEngine; -pub use executor::{NodeOutput, RunOutput}; use nvisy_core::Error; use nvisy_ontology::context::Contexts; use nvisy_ontology::entity::DetectionOutput; use nvisy_ontology::policy::{Policies, RedactionSummary}; -pub use ontology::{Explainable, Explanation}; -pub use runs::{NodeProgress, RunManager, RunState, RunStatus, RunSummary}; use uuid::Uuid; +pub use self::config::{ + EngineSection, LlmSection, OcrSection, RuntimeConfig, SttSection, TtsSection, +}; +pub use self::default::DefaultEngine; +pub use self::executor::{NodeOutput, RunOutput}; +pub use self::ontology::{Explainable, Explanation}; +pub use self::runs::{NodeProgress, RunManager, RunState, RunStatus, RunSummary}; use crate::compiler::Graph; use crate::provenance::{Audit, PolicyEvaluation, RedactionMap}; diff --git 
a/crates/nvisy-engine/src/pipeline/ontology.rs b/crates/nvisy-engine/src/pipeline/ontology.rs index e26765a3..fefb1503 100644 --- a/crates/nvisy-engine/src/pipeline/ontology.rs +++ b/crates/nvisy-engine/src/pipeline/ontology.rs @@ -1,10 +1,10 @@ //! Explainability metadata for data protection decisions. //! -//! An [`Explanation`] records why an action was taken — which model, rule, +//! An [`Explanation`] records why an action was taken: which model, rule, //! and confidence level were involved. Types that carry this metadata //! implement the [`Explainable`] trait. -use nvisy_ontology::entity::{DetectionMethod, ModelInfo}; +use nvisy_ontology::entity::{ModelInfo, RecognitionMethod}; use schemars::JsonSchema; use semver::Version; use serde::{Deserialize, Serialize}; @@ -31,9 +31,9 @@ pub struct Explanation { /// Detection confidence score. #[serde(skip_serializing_if = "Option::is_none")] pub confidence: Option, - /// Detection method used. + /// Recognition method used. #[serde(skip_serializing_if = "Option::is_none")] - pub detection_method: Option, + pub recognition_method: Option, /// Human-readable reason for the action. 
#[serde(skip_serializing_if = "Option::is_none")] pub reason: Option, diff --git a/crates/nvisy-engine/src/pipeline/policy/mod.rs b/crates/nvisy-engine/src/pipeline/policy/mod.rs index 5fd868ac..a5b29ec7 100644 --- a/crates/nvisy-engine/src/pipeline/policy/mod.rs +++ b/crates/nvisy-engine/src/pipeline/policy/mod.rs @@ -9,5 +9,5 @@ mod retry; mod timeout; -pub use retry::CompiledRetryPolicy; -pub use timeout::CompiledTimeoutPolicy; +pub use self::retry::CompiledRetryPolicy; +pub use self::timeout::CompiledTimeoutPolicy; diff --git a/crates/nvisy-engine/src/provenance/action/mod.rs b/crates/nvisy-engine/src/provenance/action/mod.rs index 3fac7e84..ffba31f8 100644 --- a/crates/nvisy-engine/src/provenance/action/mod.rs +++ b/crates/nvisy-engine/src/provenance/action/mod.rs @@ -7,6 +7,6 @@ mod inference; mod lifecycle; mod processing; -pub use inference::{InferenceAction, InferenceActionBuilder}; -pub use lifecycle::{LifecycleAction, LifecycleActionBuilder}; -pub use processing::{ProcessingAction, ProcessingActionBuilder}; +pub use self::inference::{InferenceAction, InferenceActionBuilder}; +pub use self::lifecycle::{LifecycleAction, LifecycleActionBuilder}; +pub use self::processing::{ProcessingAction, ProcessingActionBuilder}; diff --git a/crates/nvisy-engine/src/provenance/kind.rs b/crates/nvisy-engine/src/provenance/kind.rs index 16cb6024..ef87446a 100644 --- a/crates/nvisy-engine/src/provenance/kind.rs +++ b/crates/nvisy-engine/src/provenance/kind.rs @@ -1,6 +1,6 @@ //! Two-level tagged enum discriminating audit entry categories. -use nvisy_ontology::entity::DetectionMethod; +use nvisy_ontology::entity::ExtractionMethod; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -27,16 +27,18 @@ pub enum InferenceKind { } impl InferenceKind { - /// Returns the [`DetectionMethod`] that corresponds to this inference kind. 
- pub fn detection_method(&self) -> DetectionMethod { + /// Returns the [`ExtractionMethod`] for inference kinds that perform + /// content extraction. Returns `None` for pure recognition or + /// non-extraction operations. + pub fn extraction_method(&self) -> Option { match self { - Self::Ocr(_) => DetectionMethod::Ocr, - Self::Transcription(_) => DetectionMethod::SpeechTranscript, - Self::Ner(_) => DetectionMethod::Ner, - Self::ComputerVision(_) => DetectionMethod::ObjectDetection, - Self::Translation(_) | Self::Classification(_) | Self::Summarization(_) => { - DetectionMethod::ContextualNlp - } + Self::Ocr(_) => Some(ExtractionMethod::OpticalCharacterRecognition), + Self::Transcription(_) => Some(ExtractionMethod::Transcription), + Self::ComputerVision(_) => Some(ExtractionMethod::ObjectDetection), + Self::Ner(_) + | Self::Translation(_) + | Self::Classification(_) + | Self::Summarization(_) => None, } } } diff --git a/crates/nvisy-engine/src/provenance/mod.rs b/crates/nvisy-engine/src/provenance/mod.rs index ad54b5cd..bc888b20 100644 --- a/crates/nvisy-engine/src/provenance/mod.rs +++ b/crates/nvisy-engine/src/provenance/mod.rs @@ -19,14 +19,14 @@ mod kind; mod action; mod record; -pub use action::{ +pub use self::action::{ InferenceAction, InferenceActionBuilder, LifecycleAction, LifecycleActionBuilder, ProcessingAction, ProcessingActionBuilder, }; -pub use audit::Audit; -pub use entry::{AuditEntry, AuditEntryBuilder, AuditEntryBuilderError, AuditEntryStatus}; -pub use kind::{AuditEntryKind, InferenceKind, LifecycleKind, ProcessingKind}; -pub use record::{ +pub use self::audit::Audit; +pub use self::entry::{AuditEntry, AuditEntryBuilder, AuditEntryBuilderError, AuditEntryStatus}; +pub use self::kind::{AuditEntryKind, InferenceKind, LifecycleKind, ProcessingKind}; +pub use self::record::{ PolicyEvaluation, RedactionDecision, RedactionMap, RedactionMapEntry, RedactionRecord, ReviewDecision, ReviewStatus, }; diff --git 
a/crates/nvisy-engine/src/provenance/record/mod.rs b/crates/nvisy-engine/src/provenance/record/mod.rs index 7d9cdac6..e65bf958 100644 --- a/crates/nvisy-engine/src/provenance/record/mod.rs +++ b/crates/nvisy-engine/src/provenance/record/mod.rs @@ -7,8 +7,8 @@ mod map; mod redaction; mod review; -pub use decision::RedactionDecision; -pub use evaluation::PolicyEvaluation; -pub use map::{RedactionMap, RedactionMapEntry}; -pub use redaction::RedactionRecord; -pub use review::{ReviewDecision, ReviewStatus}; +pub use self::decision::RedactionDecision; +pub use self::evaluation::PolicyEvaluation; +pub use self::map::{RedactionMap, RedactionMapEntry}; +pub use self::redaction::RedactionRecord; +pub use self::review::{ReviewDecision, ReviewStatus}; diff --git a/crates/nvisy-http/src/lib.rs b/crates/nvisy-http/src/lib.rs index 7ab3e9bd..b6aaad6b 100644 --- a/crates/nvisy-http/src/lib.rs +++ b/crates/nvisy-http/src/lib.rs @@ -5,7 +5,7 @@ mod client; mod middleware; -pub use client::{HttpClient, HttpConfig}; +pub use self::client::{HttpClient, HttpConfig}; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-ocr/src/backend/mod.rs b/crates/nvisy-ocr/src/backend/mod.rs index f3ad815f..10adbcd9 100644 --- a/crates/nvisy-ocr/src/backend/mod.rs +++ b/crates/nvisy-ocr/src/backend/mod.rs @@ -3,13 +3,14 @@ mod input; mod output; -pub use input::ImageInput; use nvisy_core::Error; pub use nvisy_core::media::ImageFormat; -pub use output::{Block, BlockKind, ImageOutput, Line, Page, Word}; use reqwest_middleware::reqwest::Response; use reqwest_middleware::reqwest::multipart::Part; +pub use self::input::ImageInput; +pub use self::output::{Block, BlockKind, ImageOutput, Line, Page, Word}; + /// Build a multipart [`Part`] from an [`ImageInput`]. 
pub(crate) fn image_part(image: &ImageInput) -> Result { let filename = format!("image.{}", image.format.extension()); diff --git a/crates/nvisy-ocr/src/engine/mod.rs b/crates/nvisy-ocr/src/engine/mod.rs index 6a6c8024..7893219d 100644 --- a/crates/nvisy-ocr/src/engine/mod.rs +++ b/crates/nvisy-ocr/src/engine/mod.rs @@ -6,9 +6,9 @@ use std::fmt; use std::sync::Arc; use nvisy_core::Error; -pub use params::OcrProvider; use tracing::instrument; +pub use self::params::OcrProvider; use crate::backend::{Backend, ImageInput, ImageOutput, RunParams}; /// Type-erased OCR engine wrapping any [`Backend`] implementation. diff --git a/crates/nvisy-ocr/src/lib.rs b/crates/nvisy-ocr/src/lib.rs index ae0dfbaf..e1078f74 100644 --- a/crates/nvisy-ocr/src/lib.rs +++ b/crates/nvisy-ocr/src/lib.rs @@ -9,7 +9,7 @@ pub mod provider; #[doc(hidden)] pub mod prelude; -pub use backend::{ +pub use self::backend::{ Backend, Block, BlockKind, ImageFormat, ImageInput, ImageOutput, Line, Page, RunParams, Word, }; -pub use engine::{OcrEngine, OcrProvider}; +pub use self::engine::{OcrEngine, OcrProvider}; diff --git a/crates/nvisy-ocr/src/provider/aws_textract/mod.rs b/crates/nvisy-ocr/src/provider/aws_textract/mod.rs index 26d9ca77..2bae9383 100644 --- a/crates/nvisy-ocr/src/provider/aws_textract/mod.rs +++ b/crates/nvisy-ocr/src/provider/aws_textract/mod.rs @@ -6,5 +6,5 @@ mod backend; mod params; -pub use backend::AwsTextractBackend; -pub use params::AwsTextractParams; +pub use self::backend::AwsTextractBackend; +pub use self::params::AwsTextractParams; diff --git a/crates/nvisy-ocr/src/provider/azure_docai/mod.rs b/crates/nvisy-ocr/src/provider/azure_docai/mod.rs index 24f46228..5c09835f 100644 --- a/crates/nvisy-ocr/src/provider/azure_docai/mod.rs +++ b/crates/nvisy-ocr/src/provider/azure_docai/mod.rs @@ -6,5 +6,5 @@ mod backend; mod params; -pub use backend::AzureDocaiBackend; -pub use params::AzureDocaiParams; +pub use self::backend::AzureDocaiBackend; +pub use 
self::params::AzureDocaiParams; diff --git a/crates/nvisy-ocr/src/provider/datalab_surya/mod.rs b/crates/nvisy-ocr/src/provider/datalab_surya/mod.rs index bb595da0..793fe5c8 100644 --- a/crates/nvisy-ocr/src/provider/datalab_surya/mod.rs +++ b/crates/nvisy-ocr/src/provider/datalab_surya/mod.rs @@ -6,5 +6,5 @@ mod backend; mod params; -pub use backend::SuryaBackend; -pub use params::SuryaParams; +pub use self::backend::SuryaBackend; +pub use self::params::SuryaParams; diff --git a/crates/nvisy-ocr/src/provider/google_vision/mod.rs b/crates/nvisy-ocr/src/provider/google_vision/mod.rs index 803dda9b..e45502ec 100644 --- a/crates/nvisy-ocr/src/provider/google_vision/mod.rs +++ b/crates/nvisy-ocr/src/provider/google_vision/mod.rs @@ -6,5 +6,5 @@ mod backend; mod params; -pub use backend::GoogleVisionBackend; -pub use params::GoogleVisionParams; +pub use self::backend::GoogleVisionBackend; +pub use self::params::GoogleVisionParams; diff --git a/crates/nvisy-ocr/src/provider/mod.rs b/crates/nvisy-ocr/src/provider/mod.rs index bf4b7831..b4509b4a 100644 --- a/crates/nvisy-ocr/src/provider/mod.rs +++ b/crates/nvisy-ocr/src/provider/mod.rs @@ -3,8 +3,8 @@ mod datalab_surya; mod paddle_paddlex; -pub use datalab_surya::{SuryaBackend, SuryaParams}; -pub use paddle_paddlex::{PaddleXBackend, PaddleXParams}; +pub use self::datalab_surya::{SuryaBackend, SuryaParams}; +pub use self::paddle_paddlex::{PaddleXBackend, PaddleXParams}; #[cfg(feature = "aws-textract")] #[cfg_attr(docsrs, doc(cfg(feature = "aws-textract")))] @@ -18,10 +18,10 @@ mod google_vision; #[cfg(feature = "aws-textract")] #[cfg_attr(docsrs, doc(cfg(feature = "aws-textract")))] -pub use aws_textract::{AwsTextractBackend, AwsTextractParams}; +pub use self::aws_textract::{AwsTextractBackend, AwsTextractParams}; #[cfg(feature = "azure-docai")] #[cfg_attr(docsrs, doc(cfg(feature = "azure-docai")))] -pub use azure_docai::{AzureDocaiBackend, AzureDocaiParams}; +pub use self::azure_docai::{AzureDocaiBackend, 
AzureDocaiParams}; #[cfg(feature = "google-vision")] #[cfg_attr(docsrs, doc(cfg(feature = "google-vision")))] -pub use google_vision::{GoogleVisionBackend, GoogleVisionParams}; +pub use self::google_vision::{GoogleVisionBackend, GoogleVisionParams}; diff --git a/crates/nvisy-ocr/src/provider/paddle_paddlex/mod.rs b/crates/nvisy-ocr/src/provider/paddle_paddlex/mod.rs index 5c02e823..7d6f28ec 100644 --- a/crates/nvisy-ocr/src/provider/paddle_paddlex/mod.rs +++ b/crates/nvisy-ocr/src/provider/paddle_paddlex/mod.rs @@ -6,5 +6,5 @@ mod backend; mod params; -pub use backend::PaddleXBackend; -pub use params::PaddleXParams; +pub use self::backend::PaddleXBackend; +pub use self::params::PaddleXParams; diff --git a/crates/nvisy-ontology/src/context/analytic/mod.rs b/crates/nvisy-ontology/src/context/analytic/mod.rs index 841d9a93..5b2c7aef 100644 --- a/crates/nvisy-ontology/src/context/analytic/mod.rs +++ b/crates/nvisy-ontology/src/context/analytic/mod.rs @@ -3,11 +3,12 @@ mod embedding; mod pattern; -pub use embedding::EmbeddingData; -pub use pattern::PatternData; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +pub use self::embedding::EmbeddingData; +pub use self::pattern::PatternData; + /// Analytic computation variants. #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(tag = "kind", rename_all = "snake_case")] diff --git a/crates/nvisy-ontology/src/context/biometric/mod.rs b/crates/nvisy-ontology/src/context/biometric/mod.rs index 79f9decc..de235e5a 100644 --- a/crates/nvisy-ontology/src/context/biometric/mod.rs +++ b/crates/nvisy-ontology/src/context/biometric/mod.rs @@ -3,10 +3,11 @@ mod face; mod voice; -pub use face::FaceData; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use voice::VoiceData; + +pub use self::face::FaceData; +pub use self::voice::VoiceData; /// Biometric identity verification variants. 
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] diff --git a/crates/nvisy-ontology/src/context/document/mod.rs b/crates/nvisy-ontology/src/context/document/mod.rs index 736aaa56..b34d15e5 100644 --- a/crates/nvisy-ontology/src/context/document/mod.rs +++ b/crates/nvisy-ontology/src/context/document/mod.rs @@ -5,8 +5,9 @@ mod template; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use signature::SignatureData; -pub use template::TemplateData; + +pub use self::signature::SignatureData; +pub use self::template::TemplateData; /// Document-related reference variants. #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] diff --git a/crates/nvisy-ontology/src/context/geospatial/mod.rs b/crates/nvisy-ontology/src/context/geospatial/mod.rs index e3426517..0e81d0ba 100644 --- a/crates/nvisy-ontology/src/context/geospatial/mod.rs +++ b/crates/nvisy-ontology/src/context/geospatial/mod.rs @@ -4,12 +4,13 @@ mod address; mod coordinates; mod region; -pub use address::AddressData; -pub use coordinates::GeoCoordinate; -pub use region::{GeoBounds, GeoShape, RegionData}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +pub use self::address::AddressData; +pub use self::coordinates::GeoCoordinate; +pub use self::region::{GeoBounds, GeoShape, RegionData}; + /// Geospatial location variants. 
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(tag = "kind", rename_all = "snake_case")] diff --git a/crates/nvisy-ontology/src/context/mod.rs b/crates/nvisy-ontology/src/context/mod.rs index e49872f1..9c293157 100644 --- a/crates/nvisy-ontology/src/context/mod.rs +++ b/crates/nvisy-ontology/src/context/mod.rs @@ -12,11 +12,12 @@ pub mod geospatial; pub mod reference; pub mod temporal; -pub use entry::{ContextEntry, ContextEntryData}; use nvisy_core::content::ContentSource; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +pub use self::entry::{ContextEntry, ContextEntryData}; + /// A collection of [`Context`]s attached to a pipeline run. #[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)] pub struct Contexts { diff --git a/crates/nvisy-ontology/src/context/reference/mod.rs b/crates/nvisy-ontology/src/context/reference/mod.rs index d6ba0167..e7b0be2e 100644 --- a/crates/nvisy-ontology/src/context/reference/mod.rs +++ b/crates/nvisy-ontology/src/context/reference/mod.rs @@ -5,12 +5,13 @@ mod image; mod tag; mod text; -pub use credential::CredentialData; -pub use image::ImageData; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use tag::TagData; -pub use text::{TextData, TextEntry}; + +pub use self::credential::CredentialData; +pub use self::image::ImageData; +pub use self::tag::TagData; +pub use self::text::{TextData, TextEntry}; /// Direct comparison reference variants. #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] diff --git a/crates/nvisy-ontology/src/context/temporal/mod.rs b/crates/nvisy-ontology/src/context/temporal/mod.rs index 5fa8f727..7055d353 100644 --- a/crates/nvisy-ontology/src/context/temporal/mod.rs +++ b/crates/nvisy-ontology/src/context/temporal/mod.rs @@ -2,10 +2,11 @@ mod date; -pub use date::DateData; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +pub use self::date::DateData; + /// Temporal matching variants. 
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(tag = "kind", rename_all = "snake_case")] diff --git a/crates/nvisy-ontology/src/entity/category.rs b/crates/nvisy-ontology/src/entity/category.rs index 4e394892..5460051e 100644 --- a/crates/nvisy-ontology/src/entity/category.rs +++ b/crates/nvisy-ontology/src/entity/category.rs @@ -1,31 +1,55 @@ -//! Shared entity category tag. +//! Broad entity category classification. //! -//! [`EntityCategory`] classifies detected sensitive data into broad -//! categories used by both detection and pattern matching crates. +//! [`EntityCategory`] groups related [`EntityKind`](super::EntityKind) +//! variants into policy-addressable buckets. Policy selectors can +//! target an entire category (e.g. "redact all financial data") without +//! enumerating individual kinds. use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use strum::{Display, EnumString}; -/// Category of sensitive data an entity belongs to. -#[derive(Debug, Clone, PartialEq, Eq, Hash, Display, EnumString)] -#[derive(Serialize, Deserialize, JsonSchema)] +/// Broad category of sensitive data. +/// +/// Each [`EntityKind`](super::EntityKind) maps to exactly one category +/// via [`EntityKind::category()`](super::EntityKind::category). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum EntityCategory { - /// Personally Identifiable Information (names, SSNs, addresses, etc.). - Pii, - /// Protected Health Information (HIPAA-regulated data). - Phi, - /// Financial data (credit card numbers, bank accounts, etc.). + /// Personal identity: names, government IDs, dates of birth, and + /// other attributes that directly identify a natural person. + PersonalIdentity, + /// Contact information: email addresses, phone numbers, physical + /// addresses, postal codes, and URLs. 
+ ContactInfo, + /// Demographic attributes: age, gender, ethnicity, religion, + /// nationality, and citizenship. + Demographic, + /// Financial instruments and accounts: payment cards, bank + /// accounts, routing numbers, IBAN, crypto addresses, and + /// monetary amounts. Financial, - /// Secrets and credentials (API keys, passwords, tokens). - Credentials, - /// Legal documents and privileged communications. - Legal, - /// Biometric data (fingerprints, iris scans, voiceprints). + /// Protected health information: medical record numbers, + /// insurance IDs, prescriptions, diagnoses, and medications. + Health, + /// Biometric identifiers: fingerprints, voiceprints, retina + /// scans, and facial geometry templates. Biometric, - /// User-defined or plugin-specific category. - #[strum(default)] - Custom(String), + /// Secrets and credentials: passwords, API keys, authentication + /// tokens, and private cryptographic keys. + Credentials, + /// Network and device identifiers: IP addresses, MAC addresses, + /// device IDs, and usernames. + NetworkIdentifier, + /// Geographic and spatial data: GPS coordinates and geolocation + /// metadata. + Location, + /// Sensitive visual elements detected in images or video: + /// faces, handwriting, signatures, logos, and barcodes. + Visual, + /// Organizational identifiers: company names, departments, + /// facilities, and institutional reference numbers. + Organizational, } diff --git a/crates/nvisy-ontology/src/entity/kind.rs b/crates/nvisy-ontology/src/entity/kind.rs index dbf0a2ea..3c74d0a4 100644 --- a/crates/nvisy-ontology/src/entity/kind.rs +++ b/crates/nvisy-ontology/src/entity/kind.rs @@ -4,8 +4,9 @@ //! can detect or redact. Each variant maps to a stable `snake_case` //! string for serialization and display. //! -//! Every variant also maps to an [`EntityCategory`] via [`EntityKind::category`] -//! and an [`EntitySensitivity`] via [`EntityKind::sensitivity`]. +//! Every variant also maps to: +//! 
- an [`EntityCategory`] via [`EntityKind::category`], +//! - an [`EntitySensitivity`] via [`EntityKind::sensitivity`]. use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -20,7 +21,11 @@ use super::sensitivity::EntitySensitivity; #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum EntityKind { - // Identity documents: + // Personal identity + /// Person name (full, first, or last). + PersonName, + /// Date of birth. + DateOfBirth, /// Government-issued identification number (SSN, SIN, Aadhaar, national ID, etc.). GovernmentId, /// Tax identification number (ITIN, EIN, TIN, etc.). @@ -29,44 +34,42 @@ pub enum EntityKind { DriversLicense, /// Passport number. PassportNumber, + /// National insurance or social-security equivalent (NI, BSN, AHVN, etc.). + NationalInsuranceNumber, /// Vehicle identification number (VIN). VehicleId, /// License plate number. LicensePlate, - // Personal information: - /// Person name (full, first, or last). - PersonName, - /// Date of birth. - DateOfBirth, - /// Age value. - Age, - /// Demographic attribute (gender, race/ethnicity, religion, orientation, etc.). - Demographic, - - // Contact information: + // Contact information /// Email address. EmailAddress, /// Phone number. PhoneNumber, - /// Physical / mailing address. + /// Physical or mailing address. Address, /// Postal or ZIP code. PostalCode, /// URL or hyperlink. Url, - // Network & device identifiers: - /// IP address (v4 or v6). - IpAddress, - /// MAC (hardware) address. - MacAddress, - /// Device identifier (IMEI, IDFA, etc.). - DeviceId, - /// Username or online handle. - Username, - - // Financial: + // Demographic + /// Age value. + Age, + /// Gender identity. + Gender, + /// Racial or ethnic background. + Ethnicity, + /// Religious affiliation. + Religion, + /// Nationality. + Nationality, + /// Citizenship status. + Citizenship, + /// Language or dialect spoken. 
+ Language, + + // Financial /// Payment card number (credit or debit). PaymentCard, /// Payment card security code (CVV/CVC). @@ -75,26 +78,40 @@ pub enum EntityKind { CardExpiry, /// Bank account number. BankAccount, - /// Bank routing / transit number. + /// Bank routing or transit number. BankRouting, /// International Bank Account Number (IBAN). Iban, /// SWIFT / BIC code. SwiftCode, - /// Monetary amount. - Amount, /// Cryptocurrency wallet address. CryptoAddress, + /// Monetary amount. + Amount, - // Health: + // Health /// Medical or patient identifier. MedicalId, /// Insurance policy number. InsuranceId, /// Prescription number. PrescriptionId, + /// Medical diagnosis or condition. + Diagnosis, + /// Drug or medication name in a patient context. + Medication, + + // Biometric + /// Fingerprint template or minutiae data. + Fingerprint, + /// Voiceprint or speaker embedding. + Voiceprint, + /// Retina or iris scan data. + RetinaScan, + /// Facial geometry or face embedding (not a photo: see [`Face`](Self::Face)). + FacialGeometry, - // Credentials: + // Credentials /// Password or passphrase. Password, /// API key. @@ -104,31 +121,23 @@ pub enum EntityKind { /// Private cryptographic key. PrivateKey, - // Biometric: - /// Fingerprint template or minutiae data. - Fingerprint, - /// Voiceprint / speaker embedding. - Voiceprint, - /// Retina or iris scan data. - RetinaScan, - /// Facial geometry / face embedding (not a photo — see [`Face`](Self::Face)). - FacialGeometry, + // Network and device identifiers + /// IP address (v4 or v6). + IpAddress, + /// MAC (hardware) address. + MacAddress, + /// Device identifier (IMEI, IDFA, etc.). + DeviceId, + /// Username or online handle. + Username, - // Location: + // Location /// GPS coordinates (latitude / longitude). Coordinates, /// Geolocation metadata (EXIF, cell tower, etc.). GeolocationMetadata, - // Dates & times: - /// Date and/or time value. 
- DateTime, - - // Organizations: - /// Company or organisation name. - OrganizationName, - - // Visual / image entities: + // Visual /// Detected human face in an image. Face, /// Handwritten text region. @@ -139,35 +148,54 @@ pub enum EntityKind { Logo, /// Barcode (1D) or QR code (2D). Barcode, + + // Organizational + /// Company or institution name. + OrganizationName, + /// Internal division or department name. + DepartmentName, + /// Physical facility name (hospital, office, school). + FacilityName, + /// Legal or administrative case identifier. + CaseNumber, + /// Internal reference number (invoice, contract, PO, employee number, membership ID). + InternalId, + + // Temporal + /// Date, time, or datetime value. + DateTime, } impl EntityKind { /// Returns the [`EntityCategory`] this entity kind belongs to. pub fn category(&self) -> EntityCategory { match self { - // Identity & personal - Self::GovernmentId + // Personal identity + Self::PersonName + | Self::DateOfBirth + | Self::GovernmentId | Self::TaxId | Self::DriversLicense | Self::PassportNumber + | Self::NationalInsuranceNumber | Self::VehicleId - | Self::LicensePlate - | Self::PersonName - | Self::DateOfBirth - | Self::Age - | Self::Demographic => EntityCategory::Pii, + | Self::LicensePlate => EntityCategory::PersonalIdentity, // Contact Self::EmailAddress | Self::PhoneNumber | Self::Address | Self::PostalCode - | Self::Url => EntityCategory::Pii, + | Self::Url => EntityCategory::ContactInfo, - // Network & device - Self::IpAddress | Self::MacAddress | Self::DeviceId | Self::Username => { - EntityCategory::Pii - } + // Demographic + Self::Age + | Self::Gender + | Self::Ethnicity + | Self::Religion + | Self::Nationality + | Self::Citizenship + | Self::Language => EntityCategory::Demographic, // Financial Self::PaymentCard @@ -177,35 +205,50 @@ impl EntityKind { | Self::BankRouting | Self::Iban | Self::SwiftCode - | Self::Amount - | Self::CryptoAddress => EntityCategory::Financial, + | 
Self::CryptoAddress + | Self::Amount => EntityCategory::Financial, // Health - Self::MedicalId | Self::InsuranceId | Self::PrescriptionId => EntityCategory::Phi, + Self::MedicalId + | Self::InsuranceId + | Self::PrescriptionId + | Self::Diagnosis + | Self::Medication => EntityCategory::Health, + + // Biometric + Self::Fingerprint | Self::Voiceprint | Self::RetinaScan | Self::FacialGeometry => { + EntityCategory::Biometric + } // Credentials Self::Password | Self::ApiKey | Self::AuthToken | Self::PrivateKey => { EntityCategory::Credentials } - // Biometric - Self::Fingerprint - | Self::Voiceprint - | Self::RetinaScan - | Self::FacialGeometry - | Self::Face => EntityCategory::Biometric, + // Network + Self::IpAddress | Self::MacAddress | Self::DeviceId | Self::Username => { + EntityCategory::NetworkIdentifier + } // Location - Self::Coordinates | Self::GeolocationMetadata => EntityCategory::Pii, - - // Dates & times - Self::DateTime => EntityCategory::Pii, + Self::Coordinates | Self::GeolocationMetadata => EntityCategory::Location, - // Organizations - Self::OrganizationName => EntityCategory::Pii, + // Visual + Self::Face | Self::Handwriting | Self::Signature | Self::Logo | Self::Barcode => { + EntityCategory::Visual + } - // Visual / image - Self::Handwriting | Self::Signature | Self::Logo | Self::Barcode => EntityCategory::Pii, + // Organizational + Self::OrganizationName + | Self::DepartmentName + | Self::FacilityName + | Self::CaseNumber + | Self::InternalId => EntityCategory::Organizational, + + // Temporal (grouped under PersonalIdentity: bare dates most + // commonly appear alongside personal data and are regulated + // as PII by GDPR/CCPA) + Self::DateTime => EntityCategory::PersonalIdentity, } } @@ -215,6 +258,7 @@ impl EntityKind { // Critical: irrevocable identifiers, secrets, biometrics Self::GovernmentId | Self::PassportNumber + | Self::NationalInsuranceNumber | Self::PaymentCard | Self::CardSecurityCode | Self::BankAccount @@ -238,33 +282,46 @@ impl 
EntityKind { | Self::MedicalId | Self::InsuranceId | Self::PrescriptionId + | Self::Diagnosis + | Self::Medication | Self::Iban | Self::CryptoAddress | Self::Face - | Self::Signature => EntitySensitivity::High, + | Self::Signature + | Self::Coordinates => EntitySensitivity::High, // Medium: indirectly identifying Self::Age - | Self::Demographic + | Self::Gender + | Self::Ethnicity + | Self::Religion + | Self::Nationality + | Self::Citizenship + | Self::Language | Self::PostalCode | Self::IpAddress | Self::MacAddress | Self::DeviceId | Self::Username - | Self::Coordinates - | Self::GeolocationMetadata | Self::CardExpiry | Self::BankRouting | Self::SwiftCode | Self::VehicleId | Self::LicensePlate + | Self::GeolocationMetadata | Self::DateTime - | Self::Handwriting => EntitySensitivity::Medium, + | Self::Handwriting + | Self::CaseNumber + | Self::InternalId => EntitySensitivity::Medium, - // Low: quasi-public - Self::Url | Self::Amount | Self::OrganizationName | Self::Logo | Self::Barcode => { - EntitySensitivity::Low - } + // Low: quasi-public or context-dependent + Self::Url + | Self::Amount + | Self::OrganizationName + | Self::DepartmentName + | Self::FacilityName + | Self::Logo + | Self::Barcode => EntitySensitivity::Low, } } } @@ -302,10 +359,38 @@ mod tests { } #[test] - fn category_pii() { - assert_eq!(EntityKind::GovernmentId.category(), EntityCategory::Pii); - assert_eq!(EntityKind::PersonName.category(), EntityCategory::Pii); - assert_eq!(EntityKind::Address.category(), EntityCategory::Pii); + fn category_personal_identity() { + assert_eq!( + EntityKind::GovernmentId.category(), + EntityCategory::PersonalIdentity + ); + assert_eq!( + EntityKind::PersonName.category(), + EntityCategory::PersonalIdentity + ); + assert_eq!( + EntityKind::DateOfBirth.category(), + EntityCategory::PersonalIdentity + ); + } + + #[test] + fn category_contact_info() { + assert_eq!( + EntityKind::EmailAddress.category(), + EntityCategory::ContactInfo + ); + 
assert_eq!(EntityKind::Address.category(), EntityCategory::ContactInfo); + } + + #[test] + fn category_demographic() { + assert_eq!(EntityKind::Gender.category(), EntityCategory::Demographic); + assert_eq!( + EntityKind::Ethnicity.category(), + EntityCategory::Demographic + ); + assert_eq!(EntityKind::Religion.category(), EntityCategory::Demographic); } #[test] @@ -318,23 +403,16 @@ mod tests { } #[test] - fn category_phi() { - assert_eq!(EntityKind::MedicalId.category(), EntityCategory::Phi); - assert_eq!(EntityKind::PrescriptionId.category(), EntityCategory::Phi); + fn category_health() { + assert_eq!(EntityKind::MedicalId.category(), EntityCategory::Health); + assert_eq!(EntityKind::Diagnosis.category(), EntityCategory::Health); + assert_eq!(EntityKind::Medication.category(), EntityCategory::Health); } #[test] fn category_credentials() { assert_eq!(EntityKind::Password.category(), EntityCategory::Credentials); assert_eq!(EntityKind::ApiKey.category(), EntityCategory::Credentials); - assert_eq!( - EntityKind::AuthToken.category(), - EntityCategory::Credentials - ); - assert_eq!( - EntityKind::PrivateKey.category(), - EntityCategory::Credentials - ); } #[test] @@ -345,11 +423,23 @@ mod tests { ); assert_eq!(EntityKind::Voiceprint.category(), EntityCategory::Biometric); assert_eq!(EntityKind::RetinaScan.category(), EntityCategory::Biometric); + assert_eq!(EntityKind::Face.category(), EntityCategory::Visual); + } + + #[test] + fn category_organizational() { assert_eq!( - EntityKind::FacialGeometry.category(), - EntityCategory::Biometric + EntityKind::OrganizationName.category(), + EntityCategory::Organizational + ); + assert_eq!( + EntityKind::CaseNumber.category(), + EntityCategory::Organizational + ); + assert_eq!( + EntityKind::InternalId.category(), + EntityCategory::Organizational ); - assert_eq!(EntityKind::Face.category(), EntityCategory::Biometric); } #[test] @@ -383,6 +473,7 @@ mod tests { EntitySensitivity::High ); 
assert_eq!(EntityKind::MedicalId.sensitivity(), EntitySensitivity::High); + assert_eq!(EntityKind::Diagnosis.sensitivity(), EntitySensitivity::High); } #[test] diff --git a/crates/nvisy-ontology/src/entity/location/mod.rs b/crates/nvisy-ontology/src/entity/location/mod.rs index 14a8b196..3bf5cfcf 100644 --- a/crates/nvisy-ontology/src/entity/location/mod.rs +++ b/crates/nvisy-ontology/src/entity/location/mod.rs @@ -5,13 +5,14 @@ mod image; mod tabular; mod text; -pub use audio::AudioLocation; use derive_more::From; -pub use image::ImageLocation; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use tabular::TabularLocation; -pub use text::TextLocation; + +pub use self::audio::AudioLocation; +pub use self::image::ImageLocation; +pub use self::tabular::TabularLocation; +pub use self::text::TextLocation; /// A modality-specific location for a detected entity. /// diff --git a/crates/nvisy-ontology/src/entity/method.rs b/crates/nvisy-ontology/src/entity/method.rs index 028664de..42839d46 100644 --- a/crates/nvisy-ontology/src/entity/method.rs +++ b/crates/nvisy-ontology/src/entity/method.rs @@ -1,35 +1,146 @@ -//! Detection method classification. +//! Extraction, recognition, and refinement method classification. +//! +//! These enums form the provenance record for every detected entity, +//! documenting how content was extracted from its source modality, +//! how sensitive data was identified, and what post-detection +//! refinements were applied. use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use strum::{Display, EnumString}; -/// Method used to detect a sensitive entity. +/// How content was extracted from its source modality into analyzable form. +/// +/// Each variant names the technique that transformed raw content +/// (image pixels, audio samples, binary file formats) into a +/// representation suitable for entity recognition. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Display, EnumString, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] -pub enum DetectionMethod { - /// Regular expression pattern matching. +pub enum ExtractionMethod { + // Text + /// Structural parsing of document formats (PDF, DOCX, HTML) + /// into text and layout primitives. + DocumentParsing, + /// Inference of field semantics from column names, types, or + /// positional conventions in tabular data. + SchemaInference, + + // Image / Video + /// Optical character recognition: converts raster text + /// (printed or handwritten) into machine-readable characters. + OpticalCharacterRecognition, + /// Object detection: locates and labels regions of interest + /// within an image or video frame. + ObjectDetection, + /// Scene text detection: localises text embedded in natural + /// images (signs, screens, whiteboards) prior to OCR. + SceneTextDetection, + /// Table extraction: recovers row/column structure from images + /// or scanned PDFs, preserving cell relationships that plain + /// OCR loses. + TableExtraction, + /// Document layout analysis: identifies structural regions + /// (headers, footers, signature blocks, form fields) by spatial + /// arrangement rather than content. + LayoutAnalysis, + /// Metadata extraction: reads EXIF, PDF properties, or other + /// embedded metadata that may contain PII (author, GPS, device info). + MetadataExtraction, + /// Frame extraction: samples individual frames from video + /// streams for downstream image analysis. + FrameExtraction, + + // Audio / Video + /// Speech-to-text transcription: converts audio into text. + Transcription, + /// Speaker diarization: segments audio by speaker identity + /// to attribute utterances before recognition. + Diarization, +} + +/// Technique used to identify a sensitive entity within extracted content. 
+/// +/// Each variant names a self-contained recognition strategy. +/// An entity's `recognition_methods` vector records every technique +/// that contributed to its identification, ordered by application time. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum RecognitionMethod { + // Pattern + /// Regular expression matching against known PII formats. Regex, - /// Lookup in a known-value dictionary. + /// Mathematical validation of a candidate value + /// (Luhn, IBAN mod-97, SSN area rules). + Checksum, + /// Exact-match lookup in a curated value list. Dictionary, - /// Named-entity recognition via AI model. + /// Co-occurrence analysis: keywords near a candidate raise or + /// lower confidence (e.g. "SSN" adjacent to a 9-digit number). + ContextualAnalysis, + /// Format heuristics: entropy, character distribution, or + /// structural cues that suggest a value is sensitive without + /// an explicit regex. + Heuristic, + + // Model + /// Named-entity recognition via language model. Ner, - /// Contextual NLP analysis (discourse-level understanding). - ContextualNlp, - /// OCR text extraction with bounding boxes. - Ocr, - /// Face detection in images. - FaceDetection, - /// Object detection in images. - ObjectDetection, - /// Entity detection from speech transcription. - SpeechTranscript, - /// Speaker-identified audio segment for redaction. - SpeakerRedaction, - /// Multiple methods combined to produce a single detection. - Composite, - /// User-provided annotations. + /// Document or field-level classification + /// (e.g. "this column contains SSNs"). + Classification, + /// Semantic similarity search via vector embeddings. + Embedding, + /// Matching extracted values against an external identity or + /// record database. 
+ CrossReference, + + // Biometric + /// Biometric identification: face recognition, voiceprint + /// matching, or other physiological/behavioral trait analysis. + Biometric, + + // Human + /// User-provided annotation. Manual, } + +/// Post-detection refinement applied to an entity before final output. +/// +/// Refinement methods do not discover new entities: they adjust +/// confidence, merge duplicates, or verify existing detections. +/// Recorded on the entity to explain why its final state may differ +/// from the initial detection. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Display, EnumString, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum RefinementMethod { + /// Cross-detector deduplication: merges overlapping entities + /// from independent detectors, combining their confidence and + /// attribution. + Deduplication, + /// Ensemble fusion: combines confidence scores from multiple + /// detectors using a voting or averaging strategy. + EnsembleFusion, + /// Model-based verification: a secondary model (typically VLM) + /// reviews detections against source content to confirm, correct, + /// or reject. + ModelVerification, + /// Policy evaluation: applies business rules, thresholds, or + /// per-category overrides to filter or re-score detections. + PolicyEvaluation, + /// Human review: a reviewer confirmed, corrected, or rejected + /// the detection. + HumanReview, + /// Confidence calibration: adjusts raw model scores to align + /// with empirical precision targets. + ConfidenceCalibration, + /// Contextual promotion/demotion: surrounding document context + /// upgrades or downgrades an entity's confidence after initial + /// detection. 
+ ContextualAdjustment, +} diff --git a/crates/nvisy-ontology/src/entity/mod.rs b/crates/nvisy-ontology/src/entity/mod.rs index 84c9404f..6fa6f6e8 100644 --- a/crates/nvisy-ontology/src/entity/mod.rs +++ b/crates/nvisy-ontology/src/entity/mod.rs @@ -13,20 +13,21 @@ mod model; mod output; mod sensitivity; -pub use annotation::{Annotation, AnnotationKind, AnnotationLabel, AnnotationScope}; -pub use category::EntityCategory; use derive_more::{Deref, DerefMut, From, IntoIterator}; -pub use kind::EntityKind; -pub use location::{AudioLocation, ImageLocation, Location, TabularLocation, TextLocation}; -pub use method::DetectionMethod; -pub use model::{ModelInfo, ModelKind}; use nvisy_core::content::ContentSource; -pub use output::DetectionOutput; use schemars::JsonSchema; -pub use sensitivity::EntitySensitivity; use serde::{Deserialize, Serialize}; use uuid::Uuid; +pub use self::annotation::{Annotation, AnnotationKind, AnnotationLabel, AnnotationScope}; +pub use self::category::EntityCategory; +pub use self::kind::EntityKind; +pub use self::location::{AudioLocation, ImageLocation, Location, TabularLocation, TextLocation}; +pub use self::method::{ExtractionMethod, RecognitionMethod, RefinementMethod}; +pub use self::model::{ModelInfo, ModelKind}; +pub use self::output::DetectionOutput; +pub use self::sensitivity::EntitySensitivity; + /// A detected sensitive data occurrence within a document. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] @@ -40,8 +41,14 @@ pub struct Entity { pub entity_kind: EntityKind, /// The matched text or value. pub value: String, - /// How this entity was detected. - pub detection_method: DetectionMethod, + /// How content was extracted from its source modality, ordered by application time. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub extraction_methods: Vec, + /// Techniques used to identify this entity, ordered by application time. 
+ pub recognition_methods: Vec, + /// Post-detection refinements applied to this entity, ordered by application time. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub refinement_methods: Vec, /// Detection confidence score in the range `[0.0, 1.0]`. pub confidence: f64, /// Modality-specific location of the entity. @@ -61,12 +68,15 @@ impl Entity { self.source.as_uuid() } - /// Create a new entity with the given detection details. + /// Create a new entity with the given recognition method and confidence. + /// + /// The `category` is derived from `entity_kind` via + /// [`EntityKind::category()`] when not supplied explicitly. pub fn new( category: EntityCategory, entity_kind: EntityKind, value: impl Into, - detection_method: DetectionMethod, + recognition_method: RecognitionMethod, confidence: f64, ) -> Self { Self { @@ -74,7 +84,9 @@ impl Entity { category, entity_kind, value: value.into(), - detection_method, + extraction_methods: Vec::new(), + recognition_methods: vec![recognition_method], + refinement_methods: Vec::new(), confidence, location: None, language: None, @@ -82,6 +94,22 @@ impl Entity { } } + /// Create a new entity, deriving the category from the entity kind. + pub fn from_kind( + entity_kind: EntityKind, + value: impl Into, + recognition_method: RecognitionMethod, + confidence: f64, + ) -> Self { + Self::new( + entity_kind.category(), + entity_kind, + value, + recognition_method, + confidence, + ) + } + /// Set the modality-specific location on this entity. pub fn with_location(mut self, location: Location) -> Self { self.location = Some(location); @@ -151,11 +179,20 @@ impl Entities { .collect() } - /// Retain only entities matching the given detection method. - pub fn by_method(&self, method: DetectionMethod) -> Self { + /// Retain only entities that were recognised (at least partly) by the given method. 
+ pub fn by_recognition_method(&self, method: RecognitionMethod) -> Self { + self.0 + .iter() + .filter(|e| e.recognition_methods.contains(&method)) + .cloned() + .collect() + } + + /// Retain only entities whose content was extracted by the given method. + pub fn by_extraction_method(&self, method: ExtractionMethod) -> Self { self.0 .iter() - .filter(|e| e.detection_method == method) + .filter(|e| e.extraction_methods.contains(&method)) .cloned() .collect() } diff --git a/crates/nvisy-ontology/src/policy/mod.rs b/crates/nvisy-ontology/src/policy/mod.rs index 39696ffc..de53365f 100644 --- a/crates/nvisy-ontology/src/policy/mod.rs +++ b/crates/nvisy-ontology/src/policy/mod.rs @@ -7,9 +7,9 @@ mod strategy; mod summary; mod types; -pub use retention::{Retention, RetentionPolicy, RetentionScope}; -pub use rule::{PolicyRule, RuleAction, RuleCondition}; -pub use selector::EntitySelector; -pub use strategy::{AudioStrategy, ImageStrategy, Strategy, TextStrategy}; -pub use summary::RedactionSummary; -pub use types::{Policies, Policy}; +pub use self::retention::{Retention, RetentionPolicy, RetentionScope}; +pub use self::rule::{PolicyRule, RuleAction, RuleCondition}; +pub use self::selector::EntitySelector; +pub use self::strategy::{AudioStrategy, ImageStrategy, Strategy, TextStrategy}; +pub use self::summary::RedactionSummary; +pub use self::types::{Policies, Policy}; diff --git a/crates/nvisy-ontology/src/policy/strategy/mod.rs b/crates/nvisy-ontology/src/policy/strategy/mod.rs index a43dd0e1..a75968c8 100644 --- a/crates/nvisy-ontology/src/policy/strategy/mod.rs +++ b/crates/nvisy-ontology/src/policy/strategy/mod.rs @@ -9,12 +9,13 @@ mod audio; mod image; mod text; -pub use audio::AudioStrategy; use derive_more::From; -pub use image::ImageStrategy; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -pub use text::TextStrategy; + +pub use self::audio::AudioStrategy; +pub use self::image::ImageStrategy; +pub use self::text::TextStrategy; /// Unified redaction 
strategy across all modalities. /// diff --git a/crates/nvisy-ontology/src/prelude.rs b/crates/nvisy-ontology/src/prelude.rs index 1d2b4428..92a1eab4 100644 --- a/crates/nvisy-ontology/src/prelude.rs +++ b/crates/nvisy-ontology/src/prelude.rs @@ -2,7 +2,7 @@ pub use crate::context::{Context, ContextEntry, ContextEntryData}; pub use crate::entity::{ - Annotation, AnnotationKind, DetectionMethod, DetectionOutput, Entities, Entity, EntityCategory, - EntityKind, EntitySensitivity, Location, + Annotation, AnnotationKind, DetectionOutput, Entities, Entity, EntityCategory, EntityKind, + EntitySensitivity, ExtractionMethod, Location, RecognitionMethod, RefinementMethod, }; pub use crate::policy::{Policies, Policy, PolicyRule, Strategy}; diff --git a/crates/nvisy-pattern/Cargo.toml b/crates/nvisy-pattern/Cargo.toml index 18ebacbb..34ce0493 100644 --- a/crates/nvisy-pattern/Cargo.toml +++ b/crates/nvisy-pattern/Cargo.toml @@ -44,3 +44,6 @@ aho-corasick = { workspace = true, features = [] } # Observability tracing = { workspace = true, features = [] } + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/crates/nvisy-pattern/README.md b/crates/nvisy-pattern/README.md index ee38370d..139c173d 100644 --- a/crates/nvisy-pattern/README.md +++ b/crates/nvisy-pattern/README.md @@ -17,7 +17,7 @@ Detection runs in three phases: dictionary are injected as synthetic matches with confidence `1.0`. Allow-list filtering is applied inline during phases 1 and 2. All three phases -feed into a unified `Vec`. +feed into a unified `Vec`. 
### Pattern JSON schema @@ -27,7 +27,7 @@ Patterns are JSON definition files embedded at compile time from ```json { "name": "ssn", - "category": "pii", + "category": "personal_identity", "entity_type": "government_id", "pattern": { "regex": "\\b(\\d{3})-(\\d{2})-(\\d{4})\\b", @@ -56,15 +56,15 @@ Patterns are JSON definition files embedded at compile time from | Field | Type | Default | Description | |-------|------|---------|-------------| -| `regex` | string | — | Regular expression string | -| `validator` | string | — | Post-match validator name resolved via `ValidatorResolver` | +| `regex` | string | required | Regular expression string | +| `validator` | string | none | Post-match validator name resolved via `ValidatorResolver` | | `case_sensitive` | bool | `false` | Whether matching is case-sensitive | ### `dictionary` object (dictionary match source) | Field | Type | Default | Description | |-------|------|---------|-------------| -| `name` | string | — | Named dictionary from `DictionaryRegistry` | +| `name` | string | required | Named dictionary from `DictionaryRegistry` | | `case_sensitive` | bool | `false` | Whether matching is case-sensitive | ### Context rule (co-occurrence scoring) @@ -76,7 +76,7 @@ increased by `boost`, clamped to `[0.0, 1.0]`. | Field | Type | Default | Description | |-------|------|---------|-------------| -| `keywords` | string[] | — | Strings to search for in nearby spans | +| `keywords` | string[] | required | Strings to search for in nearby spans | | `window` | int | `3` | Number of spans before/after the match to examine | | `boost` | float | `0.1` | Confidence increase when a keyword is found | | `case_sensitive` | bool | `false` | Whether keyword matching is case-sensitive | @@ -88,36 +88,37 @@ adjacent spans. 
## Allow/deny lists -The `PatternEngineBuilder` supports exact-match allow and deny lists via the -[`AllowList`] and [`DenyList`] types: +Allow and deny lists are configured per-scan via [`ScanContext`], not on the +engine itself: ```rust,ignore -let allow = AllowList::new() - .with("123-45-6789") // suppress known test SSN - .with("000-00-0000"); - -let deny = DenyList::new() - .with("John Doe", EntityCategory::Pii, EntityKind::PersonName); - -let engine = PatternEngine::builder() - .with_allow(allow) - .with_deny(deny) - .build()?; +use nvisy_pattern::prelude::*; +use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod}; + +let ctx = ScanContext::new() + .with_allow(AllowList::new() + .with("123-45-6789") // suppress known test SSN + .with("000-00-0000")) + .with_deny(DenyList::new() + .with("John Doe", DenyRule { + category: EntityCategory::PersonalIdentity, + entity_kind: EntityKind::PersonName, + method: RecognitionMethod::Ner, + })); + +let matches = PatternEngine::instance().scan_text("...", &ctx); ``` - **Allow list** (`AllowList`): matched values that appear in the allow list are silently dropped during `scan_text`. - **Deny list** (`DenyList`): if a deny-list value is found in the text but was not matched by any regex or dictionary pattern, it is injected as a - synthetic `PatternMatch` with confidence `1.0` and source - `DetectionSource::DenyList`. - -Both types implement `FromIterator` for easy construction from iterators. + synthetic `RawMatch` with confidence `1.0` and `pattern_name: None`. ## Validators Validators are post-match checks resolved by name through `ValidatorResolver`. -Regex patterns reference a validator by name in their `pattern.validator` field; +Regex patterns reference a validator by name in their `pattern.validator` field: the engine runs the validator on each raw match and drops values that fail. 
## Documentation diff --git a/crates/nvisy-pattern/assets/patterns/date_of_birth.json b/crates/nvisy-pattern/assets/patterns/date_of_birth.json index 26ecd523..18ff6058 100644 --- a/crates/nvisy-pattern/assets/patterns/date_of_birth.json +++ b/crates/nvisy-pattern/assets/patterns/date_of_birth.json @@ -1,6 +1,6 @@ { "name": "date-of-birth", - "category": "pii", + "category": "personal_identity", "entity_type": "date_of_birth", "pattern": { "regex": "\\b(?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01])[/\\-](?:19|20)\\d{2}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/email.json b/crates/nvisy-pattern/assets/patterns/email.json index 87485389..eee1fb7f 100644 --- a/crates/nvisy-pattern/assets/patterns/email.json +++ b/crates/nvisy-pattern/assets/patterns/email.json @@ -1,6 +1,6 @@ { "name": "email", - "category": "pii", + "category": "contact_info", "entity_type": "email_address", "pattern": { "regex": "\\b[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/ipv4.json b/crates/nvisy-pattern/assets/patterns/ipv4.json index 971ccd91..c6358231 100644 --- a/crates/nvisy-pattern/assets/patterns/ipv4.json +++ b/crates/nvisy-pattern/assets/patterns/ipv4.json @@ -1,6 +1,6 @@ { "name": "ipv4", - "category": "pii", + "category": "network_identifier", "entity_type": "ip_address", "pattern": { "regex": "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b", diff --git a/crates/nvisy-pattern/assets/patterns/ipv6.json b/crates/nvisy-pattern/assets/patterns/ipv6.json index ce096fd2..82e7f20c 100644 --- a/crates/nvisy-pattern/assets/patterns/ipv6.json +++ b/crates/nvisy-pattern/assets/patterns/ipv6.json @@ -1,6 +1,6 @@ { "name": "ipv6", - "category": "pii", + "category": "network_identifier", "entity_type": "ip_address", "pattern": { "regex": "\\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b|(?:[0-9a-fA-F]{1,4}:){1,7}:|::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\\b", diff --git 
a/crates/nvisy-pattern/assets/patterns/languages.json b/crates/nvisy-pattern/assets/patterns/languages.json index 5d460af0..9b4d52e5 100644 --- a/crates/nvisy-pattern/assets/patterns/languages.json +++ b/crates/nvisy-pattern/assets/patterns/languages.json @@ -1,7 +1,7 @@ { "name": "languages", - "category": "pii", - "entity_type": "demographic", + "category": "demographic", + "entity_type": "language", "dictionary": { "name": "languages", "confidence": [0.85, 0.45] diff --git a/crates/nvisy-pattern/assets/patterns/mac_address.json b/crates/nvisy-pattern/assets/patterns/mac_address.json index fd8fe8eb..8d62b60f 100644 --- a/crates/nvisy-pattern/assets/patterns/mac_address.json +++ b/crates/nvisy-pattern/assets/patterns/mac_address.json @@ -1,6 +1,6 @@ { "name": "mac-address", - "category": "pii", + "category": "network_identifier", "entity_type": "mac_address", "pattern": { "regex": "\\b(?:[0-9A-Fa-f]{2}[:\\-]){5}[0-9A-Fa-f]{2}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/nationalities.json b/crates/nvisy-pattern/assets/patterns/nationalities.json index a32593c0..bec7c867 100644 --- a/crates/nvisy-pattern/assets/patterns/nationalities.json +++ b/crates/nvisy-pattern/assets/patterns/nationalities.json @@ -1,7 +1,7 @@ { "name": "nationalities", - "category": "pii", - "entity_type": "demographic", + "category": "demographic", + "entity_type": "nationality", "dictionary": { "name": "nationalities", "confidence": 0.85 diff --git a/crates/nvisy-pattern/assets/patterns/phone.json b/crates/nvisy-pattern/assets/patterns/phone.json index 5380e94d..484cc5b5 100644 --- a/crates/nvisy-pattern/assets/patterns/phone.json +++ b/crates/nvisy-pattern/assets/patterns/phone.json @@ -1,6 +1,6 @@ { "name": "phone", - "category": "pii", + "category": "contact_info", "entity_type": "phone_number", "pattern": { "regex": "(?:\\+\\d{1,3}[\\s.\\-]?)?\\(?\\d{2,4}\\)?[\\s.\\-]?\\d{3,4}[\\s.\\-]?\\d{4}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/religions.json 
b/crates/nvisy-pattern/assets/patterns/religions.json index bb3d2f2b..cf038496 100644 --- a/crates/nvisy-pattern/assets/patterns/religions.json +++ b/crates/nvisy-pattern/assets/patterns/religions.json @@ -1,7 +1,7 @@ { "name": "religions", - "category": "pii", - "entity_type": "demographic", + "category": "demographic", + "entity_type": "religion", "dictionary": { "name": "religions", "confidence": 0.85 diff --git a/crates/nvisy-pattern/assets/patterns/ssn.json b/crates/nvisy-pattern/assets/patterns/ssn.json index 12aeb750..21c887f5 100644 --- a/crates/nvisy-pattern/assets/patterns/ssn.json +++ b/crates/nvisy-pattern/assets/patterns/ssn.json @@ -1,6 +1,6 @@ { "name": "ssn", - "category": "pii", + "category": "personal_identity", "entity_type": "government_id", "pattern": { "regex": "\\b(\\d{3})-(\\d{2})-(\\d{4})\\b", diff --git a/crates/nvisy-pattern/assets/patterns/url.json b/crates/nvisy-pattern/assets/patterns/url.json index d7bebc56..6e9907fa 100644 --- a/crates/nvisy-pattern/assets/patterns/url.json +++ b/crates/nvisy-pattern/assets/patterns/url.json @@ -1,6 +1,6 @@ { "name": "url", - "category": "pii", + "category": "contact_info", "entity_type": "url", "pattern": { "regex": "\\bhttps?://[^\\s/$.?#][^\\s]*\\b", diff --git a/crates/nvisy-pattern/assets/patterns/us_drivers_license.json b/crates/nvisy-pattern/assets/patterns/us_drivers_license.json index 1c1709a4..fc39bdcf 100644 --- a/crates/nvisy-pattern/assets/patterns/us_drivers_license.json +++ b/crates/nvisy-pattern/assets/patterns/us_drivers_license.json @@ -1,6 +1,6 @@ { "name": "us-drivers-license", - "category": "pii", + "category": "personal_identity", "entity_type": "drivers_license", "pattern": { "regex": "\\b[A-Z]\\d{3}-\\d{4}-\\d{4}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/us_passport.json b/crates/nvisy-pattern/assets/patterns/us_passport.json index bf055a8f..0e0c5c57 100644 --- a/crates/nvisy-pattern/assets/patterns/us_passport.json +++ 
b/crates/nvisy-pattern/assets/patterns/us_passport.json @@ -1,6 +1,6 @@ { "name": "us-passport", - "category": "pii", + "category": "personal_identity", "entity_type": "passport_number", "pattern": { "regex": "\\b[A-Z]\\d{8}\\b", diff --git a/crates/nvisy-pattern/assets/patterns/us_postal_code.json b/crates/nvisy-pattern/assets/patterns/us_postal_code.json index b6269565..33c7c8d0 100644 --- a/crates/nvisy-pattern/assets/patterns/us_postal_code.json +++ b/crates/nvisy-pattern/assets/patterns/us_postal_code.json @@ -1,6 +1,6 @@ { "name": "us-postal-code", - "category": "pii", + "category": "contact_info", "entity_type": "postal_code", "pattern": { "regex": "\\b\\d{5}(?:-\\d{4})?\\b", diff --git a/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs b/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs index 96e55a98..e2baa979 100644 --- a/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs +++ b/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs @@ -1,17 +1,20 @@ -//! CSV dictionary: one row per entity, each cell is a matchable variant. +//! CSV dictionary: one row per entity, each cell becomes a matchable variant. -use super::Dictionary; +use std::path::Path; + +use super::{CsvDictionaryError, Dictionary, DictionaryLoadError, DictionaryTerm}; /// A dictionary parsed from a CSV file. /// /// Each row may contain multiple columns (e.g. name, symbol, code). -/// Every non-empty cell becomes a matchable term. -#[derive(Debug, Clone)] +/// Every non-empty cell becomes a matchable term whose [`column`] +/// records which CSV column it came from. +/// +/// [`column`]: DictionaryTerm::column +#[derive(Debug)] pub struct CsvDictionary { name: String, - entries: Vec, - /// Source column index for each entry (parallel to `entries`). - columns: Vec, + terms: Vec, } impl CsvDictionary { @@ -21,11 +24,14 @@ impl CsvDictionary { /// `text` is the CSV content where each non-empty cell becomes a matchable term. 
/// The column index of each cell is preserved so that per-column confidence /// scores can be applied at detection time. - pub fn new(name: impl Into, text: &str) -> Self { + /// + /// # Errors + /// + /// Returns [`CsvDictionaryError`] if any CSV record cannot be parsed. + pub fn new(name: impl Into, text: &str) -> Result { let name = name.into(); - let mut entries = Vec::new(); - let mut columns = Vec::new(); + let mut terms = Vec::new(); let mut reader = csv::ReaderBuilder::new() .has_headers(false) .flexible(true) @@ -33,21 +39,46 @@ impl CsvDictionary { .from_reader(text.as_bytes()); for result in reader.records() { - let record = result.expect("failed to parse CSV record"); + let record = result.map_err(|source| CsvDictionaryError { + name: name.clone(), + source, + })?; for (col, field) in record.iter().enumerate() { - let trimmed = field.trim(); - if !trimmed.is_empty() { - entries.push(trimmed.to_owned()); - columns.push(col); + if !field.is_empty() { + terms.push(DictionaryTerm { + value: field.to_owned(), + column: Some(col as u32), + }); } } } - Self { - name, - entries, - columns, - } + Ok(Self { name, terms }) + } + + /// Load a CSV dictionary from a file path. + /// + /// The dictionary name is derived from the file stem. + /// + /// # Errors + /// + /// Returns [`DictionaryLoadError`] if the file cannot be read or + /// the CSV content cannot be parsed. 
+ pub fn from_path(path: impl AsRef) -> Result { + let path = path.as_ref(); + let name = path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or_default(); + let text = + std::fs::read_to_string(path).map_err(|source| DictionaryLoadError::ReadFile { + path: path.to_owned(), + source, + })?; + Self::new(name, &text).map_err(|source| DictionaryLoadError::CsvParse { + path: path.to_owned(), + source, + }) } } @@ -56,12 +87,8 @@ impl Dictionary for CsvDictionary { &self.name } - fn entries(&self) -> &[String] { - &self.entries - } - - fn columns(&self) -> Option<&[usize]> { - Some(&self.columns) + fn terms(&self) -> &[DictionaryTerm] { + &self.terms } } @@ -71,20 +98,31 @@ mod tests { #[test] fn parses_rows_with_variants() { - let dict = CsvDictionary::new("test", "US Dollar,USD\nEuro,EUR\n"); + let dict = CsvDictionary::new("test", "US Dollar,USD\nEuro,EUR\n").unwrap(); assert_eq!(dict.name(), "test"); - assert_eq!(dict.entries(), &["US Dollar", "USD", "Euro", "EUR"]); + + let values: Vec<&str> = dict.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(values, &["US Dollar", "USD", "Euro", "EUR"]); } #[test] fn handles_variable_columns() { - let dict = CsvDictionary::new("test", "a,b,c\nd,e\n"); - assert_eq!(dict.entries(), &["a", "b", "c", "d", "e"]); + let dict = CsvDictionary::new("test", "a,b,c\nd,e\n").unwrap(); + let values: Vec<&str> = dict.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(values, &["a", "b", "c", "d", "e"]); } #[test] fn skips_empty_fields() { - let dict = CsvDictionary::new("test", "a,,b\n"); - assert_eq!(dict.entries(), &["a", "b"]); + let dict = CsvDictionary::new("test", "a,,b\n").unwrap(); + let values: Vec<&str> = dict.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(values, &["a", "b"]); + } + + #[test] + fn column_indices_are_tracked() { + let dict = CsvDictionary::new("test", "a,b,c\nd,e\n").unwrap(); + let columns: Vec> = dict.terms().iter().map(|t| t.column).collect(); + 
assert_eq!(columns, &[Some(0), Some(1), Some(2), Some(0), Some(1)]); } } diff --git a/crates/nvisy-pattern/src/dictionaries/csv_error.rs b/crates/nvisy-pattern/src/dictionaries/csv_error.rs new file mode 100644 index 00000000..9db84f9c --- /dev/null +++ b/crates/nvisy-pattern/src/dictionaries/csv_error.rs @@ -0,0 +1,19 @@ +//! Error type for CSV dictionary parsing. + +use nvisy_core::{Error, ErrorKind}; + +/// Error returned when a CSV dictionary cannot be parsed. +#[derive(Debug, thiserror::Error)] +#[error("failed to parse CSV record in dictionary '{name}': {source}")] +pub struct CsvDictionaryError { + pub(crate) name: String, + pub(crate) source: csv::Error, +} + +impl From for Error { + fn from(err: CsvDictionaryError) -> Self { + Error::new(ErrorKind::Validation, err.to_string()) + .with_component("nvisy-pattern::dictionaries") + .with_source(err) + } +} diff --git a/crates/nvisy-pattern/src/dictionaries/dictionary.rs b/crates/nvisy-pattern/src/dictionaries/dictionary.rs index 8edc63f4..e32f882b 100644 --- a/crates/nvisy-pattern/src/dictionaries/dictionary.rs +++ b/crates/nvisy-pattern/src/dictionaries/dictionary.rs @@ -1,12 +1,27 @@ -//! Core [`Dictionary`] trait and [`BoxDictionary`] alias. +//! Core [`Dictionary`] trait, [`DictionaryTerm`], and [`BoxDictionary`] type alias. + +/// A single matchable term within a [`Dictionary`]. +/// +/// Each term carries its matched value and, for multi-column sources like +/// CSV files, the column index it originated from. Plain-text dictionaries +/// leave `column` as `None` (logically equivalent to column 0). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DictionaryTerm { + /// The matchable text value. + pub value: String, + /// Source column index for CSV dictionaries. + /// + /// `None` for plain-text dictionaries where column position is + /// not meaningful. + pub column: Option, +} /// A named collection of matchable terms (e.g. nationalities, currencies). 
/// -/// Implementors load their entries from an asset file at compile time. /// Two built-in implementations are provided: /// /// - [`TxtDictionary`]: plain text, one entry per line. -/// - [`CsvDictionary`]: CSV, each cell is a term. +/// - [`CsvDictionary`]: CSV, each cell is a term with its column index. /// /// [`TxtDictionary`]: super::TxtDictionary /// [`CsvDictionary`]: super::CsvDictionary @@ -15,16 +30,7 @@ pub trait Dictionary: Send + Sync { fn name(&self) -> &str; /// All matchable terms produced by this dictionary. - fn entries(&self) -> &[String]; - - /// Column index for each entry, parallel to [`entries`](Self::entries). - /// - /// Returns `Some` for CSV dictionaries where each cell tracks its - /// source column. Returns `None` for plain-text dictionaries (all - /// entries are logically in column 0). - fn columns(&self) -> Option<&[usize]> { - None - } + fn terms(&self) -> &[DictionaryTerm]; } /// Type-erased boxed [`Dictionary`]. diff --git a/crates/nvisy-pattern/src/dictionaries/dictionary_error.rs b/crates/nvisy-pattern/src/dictionaries/dictionary_error.rs new file mode 100644 index 00000000..dc34f40b --- /dev/null +++ b/crates/nvisy-pattern/src/dictionaries/dictionary_error.rs @@ -0,0 +1,42 @@ +//! Error type for dictionary filesystem loading. + +use nvisy_core::{Error, ErrorKind}; + +use super::CsvDictionaryError; + +/// Error returned when loading dictionaries from the filesystem. +#[derive(Debug, thiserror::Error)] +pub enum DictionaryLoadError { + /// The directory could not be read. + #[error("failed to read dictionary directory '{}': {source}", path.display())] + ReadDir { + path: std::path::PathBuf, + source: std::io::Error, + }, + /// A dictionary file could not be read. + #[error("failed to read dictionary file '{}': {source}", path.display())] + ReadFile { + path: std::path::PathBuf, + source: std::io::Error, + }, + /// A CSV dictionary file failed to parse. 
+ #[error("failed to parse CSV dictionary '{}': {source}", path.display())] + CsvParse { + path: std::path::PathBuf, + source: CsvDictionaryError, + }, +} + +impl From for Error { + fn from(err: DictionaryLoadError) -> Self { + let kind = match &err { + DictionaryLoadError::ReadDir { .. } | DictionaryLoadError::ReadFile { .. } => { + ErrorKind::Internal + } + DictionaryLoadError::CsvParse { .. } => ErrorKind::Validation, + }; + Error::new(kind, err.to_string()) + .with_component("nvisy-pattern::dictionaries") + .with_source(err) + } +} diff --git a/crates/nvisy-pattern/src/dictionaries/dictionary_registry.rs b/crates/nvisy-pattern/src/dictionaries/dictionary_registry.rs new file mode 100644 index 00000000..27c53185 --- /dev/null +++ b/crates/nvisy-pattern/src/dictionaries/dictionary_registry.rs @@ -0,0 +1,326 @@ +//! [`DictionaryRegistry`]: named dictionary collection with O(log n) lookup. + +use std::collections::BTreeMap; +use std::path::Path; +use std::sync::LazyLock; + +use include_dir::{Dir, include_dir}; + +use super::{BoxDictionary, CsvDictionary, Dictionary, DictionaryLoadError, TxtDictionary}; + +const TARGET: &str = "nvisy_pattern::dictionaries"; + +/// A registry of named [`Dictionary`] instances with O(log n) lookup. +/// +/// Use [`load_builtins`] to create a registry pre-populated with +/// the compile-time-embedded dictionary files, or [`load_dir`] to +/// load from a filesystem directory at runtime. +/// +/// [`load_builtins`]: Self::load_builtins +/// [`load_dir`]: Self::load_dir +#[derive(Default)] +pub struct DictionaryRegistry { + inner: BTreeMap, +} + +impl std::fmt::Debug for DictionaryRegistry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let names: Vec<&str> = self.inner.keys().map(|s| s.as_str()).collect(); + f.debug_struct("DictionaryRegistry") + .field("len", &self.inner.len()) + .field("names", &names) + .finish() + } +} + +impl DictionaryRegistry { + /// Create an empty registry. 
+ pub fn new() -> Self { + Self::default() + } + + /// Insert a dictionary, keyed by its [`Dictionary::name`]. + pub fn insert(&mut self, dict: BoxDictionary) { + let name = dict.name().to_owned(); + self.inner.insert(name, dict); + } + + /// Look up a dictionary by name. + #[must_use] + pub fn get(&self, name: &str) -> Option<&dyn Dictionary> { + self.inner.get(name).map(|b| b.as_ref()) + } + + /// Iterate over all registered dictionaries as `(name, &dyn Dictionary)` pairs. + pub fn iter(&self) -> impl Iterator { + self.inner.iter().map(|(k, v)| (k.as_str(), v.as_ref())) + } + + /// Iterate over all registered dictionary names. + pub fn names(&self) -> impl Iterator { + self.inner.keys().map(|s| s.as_str()) + } + + /// Total number of registered dictionaries. + #[must_use] + pub fn len(&self) -> usize { + self.inner.len() + } + + /// Whether the registry contains no dictionaries. + #[must_use] + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Load all `.txt` and `.csv` files from the embedded + /// `assets/dictionaries/` directory into this registry. + /// + /// Unrecognised file extensions are logged as warnings and skipped. 
+ #[tracing::instrument(target = TARGET, name = "dictionaries.load_builtins", skip(self), fields(count))] + pub fn load_builtins(&mut self) { + static DICT_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/assets/dictionaries"); + + for file in DICT_DIR.files() { + let path = file.path(); + let text = file + .contents_utf8() + .expect("dictionary file is not valid UTF-8"); + + let name = path + .file_stem() + .expect("dictionary path has no file stem") + .to_string_lossy(); + + let dict: BoxDictionary = match path.extension().and_then(|e| e.to_str()) { + Some("txt") => Box::new(TxtDictionary::new(name.as_ref(), text)), + Some("csv") => Box::new( + CsvDictionary::new(name.as_ref(), text) + .expect("built-in CSV dictionary must parse"), + ), + other => { + tracing::warn!( + target: TARGET, + path = %path.display(), + extension = ?other, + "skipping unrecognised dictionary file", + ); + continue; + } + }; + + tracing::trace!( + target: TARGET, + name = dict.name(), + terms = dict.terms().len(), + "dictionary loaded", + ); + self.insert(dict); + } + + tracing::Span::current().record("count", self.len()); + tracing::debug!(target: TARGET, "built-in dictionaries loaded"); + } + + /// Load a single `.txt` or `.csv` dictionary file and insert it. + /// + /// The dictionary name is derived from the file stem. + /// Files with unrecognised extensions are logged as warnings and + /// ignored (no error is returned). + /// + /// # Errors + /// + /// Returns [`nvisy_core::Error`] if the file cannot be read or + /// a CSV file fails to parse. 
+ #[tracing::instrument(target = TARGET, name = "dictionaries.load_file", skip_all, fields(path = %path.as_ref().display()))] + pub fn load_file(&mut self, path: impl AsRef) -> nvisy_core::Result<()> { + let path = path.as_ref(); + + let dict: BoxDictionary = match path.extension().and_then(|e| e.to_str()) { + Some("txt") => { + let d = TxtDictionary::from_path(path).map_err(|source| { + DictionaryLoadError::ReadFile { + path: path.to_owned(), + source, + } + })?; + Box::new(d) + } + Some("csv") => Box::new(CsvDictionary::from_path(path)?), + other => { + tracing::warn!( + target: TARGET, + path = %path.display(), + extension = ?other, + "skipping unrecognised dictionary file", + ); + return Ok(()); + } + }; + + tracing::trace!( + target: TARGET, + name = dict.name(), + terms = dict.terms().len(), + "dictionary loaded from filesystem", + ); + self.insert(dict); + Ok(()) + } + + /// Load all `.txt` and `.csv` files from a filesystem directory. + /// + /// Files with unrecognised extensions are logged as warnings and + /// skipped. Loaded dictionaries are inserted into `self`, so this + /// can be called after [`load_builtins`](Self::load_builtins) to + /// layer user-provided dictionaries on top of the built-ins. + /// + /// # Errors + /// + /// Returns [`nvisy_core::Error`] if the directory cannot be read, + /// a file cannot be read, or a CSV file fails to parse. 
+ #[tracing::instrument(target = TARGET, name = "dictionaries.load_dir", skip_all, fields(path = %dir.as_ref().display(), count))] + pub fn load_dir(&mut self, dir: impl AsRef) -> nvisy_core::Result<()> { + let dir = dir.as_ref(); + + let entries = std::fs::read_dir(dir).map_err(|source| DictionaryLoadError::ReadDir { + path: dir.to_owned(), + source, + })?; + + let mut count = 0usize; + for entry in entries { + let entry = entry.map_err(|source| DictionaryLoadError::ReadDir { + path: dir.to_owned(), + source, + })?; + let path = entry.path(); + + if !path.is_file() { + continue; + } + + self.load_file(&path)?; + count += 1; + } + + tracing::Span::current().record("count", count); + tracing::debug!(target: TARGET, "filesystem dictionaries loaded"); + Ok(()) + } +} + +static BUILTIN_REGISTRY: LazyLock = LazyLock::new(|| { + let mut reg = DictionaryRegistry::new(); + reg.load_builtins(); + reg +}); + +/// Return a reference to the lazily-initialised built-in [`DictionaryRegistry`]. +pub fn builtin_registry() -> &'static DictionaryRegistry { + &BUILTIN_REGISTRY +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use super::*; + + fn registry() -> &'static DictionaryRegistry { + builtin_registry() + } + + #[test] + fn builtins_load_and_are_nonempty() { + let reg = registry(); + assert!(!reg.is_empty()); + for (_, dict) in reg.iter() { + assert!(!dict.terms().is_empty(), "{} is empty", dict.name()); + } + } + + #[test] + fn terms_are_trimmed_and_nonempty() { + for (_, dict) in registry().iter() { + let name = dict.name(); + for term in dict.terms() { + assert!(!term.value.is_empty(), "empty term in {name}"); + assert_eq!( + term.value, + term.value.trim(), + "untrimmed term in {name}: {:?}", + term.value, + ); + } + } + } + + #[test] + fn no_duplicate_terms_per_dictionary() { + for (_, dict) in registry().iter() { + let mut seen = HashSet::new(); + for term in dict.terms() { + assert!( + seen.insert(term.value.as_str()), + "duplicate term {:?} in 
dictionary {}", + term.value, + dict.name(), + ); + } + } + } + + #[test] + fn registry_names_are_sorted() { + let keys: Vec<&str> = registry().names().collect(); + let mut sorted = keys.clone(); + sorted.sort(); + assert_eq!(keys, sorted); + } + + #[test] + fn registry_insert_and_get() { + let mut reg = DictionaryRegistry::new(); + let dict: BoxDictionary = Box::new(TxtDictionary::new("test", "foo\nbar\n")); + reg.insert(dict); + + assert_eq!(reg.len(), 1); + + let dict = reg.get("test").unwrap(); + assert_eq!(dict.name(), "test"); + + let values: Vec<&str> = dict.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(values, &["foo", "bar"]); + } + + #[test] + fn load_dir_reads_filesystem() { + let dir = tempfile::tempdir().unwrap(); + + std::fs::write(dir.path().join("colors.txt"), "red\nblue\ngreen\n").unwrap(); + std::fs::write(dir.path().join("sizes.csv"), "small,S\nmedium,M\nlarge,L\n").unwrap(); + // Should be skipped. + std::fs::write(dir.path().join("readme.md"), "ignore me").unwrap(); + + let mut reg = DictionaryRegistry::new(); + reg.load_dir(dir.path()).unwrap(); + + assert_eq!(reg.len(), 2); + + let colors = reg.get("colors").unwrap(); + let color_values: Vec<&str> = colors.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(color_values, &["red", "blue", "green"]); + + let sizes = reg.get("sizes").unwrap(); + let size_values: Vec<&str> = sizes.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(size_values, &["small", "S", "medium", "M", "large", "L"]); + } + + #[test] + fn load_dir_missing_directory() { + let mut reg = DictionaryRegistry::new(); + let result = reg.load_dir("/nonexistent/path"); + assert!(result.is_err()); + } +} diff --git a/crates/nvisy-pattern/src/dictionaries/mod.rs b/crates/nvisy-pattern/src/dictionaries/mod.rs index 6c5d5ba6..176843c2 100644 --- a/crates/nvisy-pattern/src/dictionaries/mod.rs +++ b/crates/nvisy-pattern/src/dictionaries/mod.rs @@ -1,19 +1,19 @@ //! 
Built-in dictionaries for entity matching. //! //! Dictionaries are asset files under `assets/dictionaries/` containing -//! matchable terms (nationalities, religions, currencies, etc.). They are +//! matchable terms (nationalities, religions, currencies, etc.). They are //! embedded at compile time and loaded lazily on first access. //! //! Two file formats are supported: //! //! - **Plain text** (`.txt`): one entry per line, see [`TxtDictionary`]. -//! - **CSV** (`.csv`): each row holds variants of a single entity -//! (e.g. `US Dollar,USD`), see [`CsvDictionary`]. +//! - **CSV** (`.csv`): each row holds variants of a single entity (e.g. +//! `US Dollar,USD`), see [`CsvDictionary`]. //! //! # Key types //! //! - [`Dictionary`]: trait implemented by every dictionary. -//! - [`DictionaryRegistry`]: sorted collection with O(log n) lookup by name. +//! - [`DictionaryRegistry`]: sorted collection with O(log n) lookup. //! //! [`TxtDictionary`]: crate::dictionaries::TxtDictionary //! [`CsvDictionary`]: crate::dictionaries::CsvDictionary @@ -21,171 +21,15 @@ //! [`DictionaryRegistry`]: crate::dictionaries::DictionaryRegistry mod csv_dictionary; +mod csv_error; mod dictionary; +mod dictionary_error; +mod dictionary_registry; mod text_dictionary; -use std::collections::BTreeMap; -use std::sync::LazyLock; - -pub use csv_dictionary::CsvDictionary; -pub use dictionary::{BoxDictionary, Dictionary}; -use include_dir::{Dir, include_dir}; -pub use text_dictionary::TxtDictionary; - -/// A registry of named [`Dictionary`] instances with O(log n) lookup. -/// -/// Use [`load_builtins`] to create a registry pre-populated with -/// the compile-time-embedded dictionary files. 
-///
-/// [`load_builtins`]: Self::load_builtins
-pub struct DictionaryRegistry {
-    inner: BTreeMap<String, BoxDictionary>,
-}
-
-impl std::fmt::Debug for DictionaryRegistry {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let names: Vec<&str> = self.inner.keys().map(|s| s.as_str()).collect();
-        f.debug_struct("DictionaryRegistry")
-            .field("len", &self.inner.len())
-            .field("names", &names)
-            .finish()
-    }
-}
-
-impl DictionaryRegistry {
-    /// Create an empty registry.
-    pub fn new() -> Self {
-        Self {
-            inner: BTreeMap::new(),
-        }
-    }
-
-    /// Insert a dictionary, keyed by its [`Dictionary::name`].
-    pub fn insert(&mut self, dict: BoxDictionary) {
-        let name = dict.name().to_owned();
-        self.inner.insert(name, dict);
-    }
-
-    /// Look up a dictionary by name.
-    #[must_use]
-    pub fn get(&self, name: &str) -> Option<&dyn Dictionary> {
-        self.inner.get(name).map(|b| b.as_ref())
-    }
-
-    /// Total number of registered dictionaries.
-    #[must_use]
-    pub fn len(&self) -> usize {
-        self.inner.len()
-    }
-
-    /// Load all `.txt` and `.csv` files from the embedded
-    /// `assets/dictionaries/` directory and return a populated registry.
-    ///
-    /// Unrecognised file extensions are logged as warnings and skipped.
-    #[tracing::instrument(name = "dictionaries.load_builtins", fields(count))]
-    pub fn load_builtins() -> Self {
-        static DICT_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/assets/dictionaries");
-
-        let mut reg = Self::new();
-
-        for file in DICT_DIR.files() {
-            let path = file.path();
-            let text = file
-                .contents_utf8()
-                .expect("dictionary file is not valid UTF-8");
-
-            let name = path
-                .file_stem()
-                .expect("dictionary path has no file stem")
-                .to_string_lossy();
-
-            let dict: BoxDictionary = match path.extension().and_then(|e| e.to_str()) {
-                Some("txt") => Box::new(TxtDictionary::new(name.as_ref(), text)),
-                Some("csv") => Box::new(CsvDictionary::new(name.as_ref(), text)),
-                other => {
-                    tracing::warn!(
-                        path = %path.display(),
-                        extension = ?other,
-                        "skipping unrecognised dictionary file",
-                    );
-                    continue;
-                }
-            };
-
-            tracing::trace!(
-                name = dict.name(),
-                entries = dict.entries().len(),
-                "dictionary loaded",
-            );
-            reg.insert(dict);
-        }
-
-        tracing::Span::current().record("count", reg.len());
-        tracing::debug!("built-in dictionaries loaded");
-        reg
-    }
-}
-
-impl Default for DictionaryRegistry {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-static BUILTIN_REGISTRY: LazyLock<DictionaryRegistry> =
-    LazyLock::new(DictionaryRegistry::load_builtins);
-
-/// Return a reference to the lazily-initialised built-in [`DictionaryRegistry`].
-pub fn builtin_registry() -> &'static DictionaryRegistry {
-    &BUILTIN_REGISTRY
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    fn registry() -> &'static DictionaryRegistry {
-        builtin_registry()
-    }
-
-    #[test]
-    fn builtins_load_and_are_nonempty() {
-        let reg = registry();
-        assert!(reg.len() > 0);
-        for (_, dict) in &reg.inner {
-            assert!(!dict.entries().is_empty(), "{} is empty", dict.name());
-        }
-    }
-
-    #[test]
-    fn entries_are_trimmed_and_nonempty() {
-        for (_, dict) in &registry().inner {
-            let name = dict.name();
-            for entry in dict.entries() {
-                assert!(!entry.is_empty(), "empty entry in {name}");
-                assert_eq!(*entry, entry.trim(), "untrimmed entry in {name}: {entry:?}");
-            }
-        }
-    }
-
-    #[test]
-    fn registry_names_are_sorted() {
-        let keys: Vec<&str> = registry().inner.keys().map(|s| s.as_str()).collect();
-        let mut sorted = keys.clone();
-        sorted.sort();
-        assert_eq!(keys, sorted);
-    }
-
-    #[test]
-    fn registry_insert_and_get() {
-        let mut reg = DictionaryRegistry::new();
-        let dict: BoxDictionary = Box::new(TxtDictionary::new("test", "foo\nbar\n"));
-        reg.insert(dict);
-
-        assert_eq!(reg.len(), 1);
-
-        let dict = reg.get("test").unwrap();
-        assert_eq!(dict.name(), "test");
-        assert_eq!(dict.entries(), &["foo", "bar"]);
-    }
-}
+pub use self::csv_dictionary::CsvDictionary;
+pub(crate) use self::csv_error::CsvDictionaryError;
+pub use self::dictionary::{BoxDictionary, Dictionary, DictionaryTerm};
+pub(crate) use self::dictionary_error::DictionaryLoadError;
+pub use self::dictionary_registry::{DictionaryRegistry, builtin_registry};
+pub use self::text_dictionary::TxtDictionary;
diff --git a/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs b/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs
index 6273470d..4916aecf 100644
--- a/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs
+++ b/crates/nvisy-pattern/src/dictionaries/text_dictionary.rs
@@ -1,12 +1,14 @@
-//! Plain-text dictionary: one matchable entry per line.
+//! 
Plain-text dictionary: one entry per line. -use super::Dictionary; +use std::path::Path; + +use super::{Dictionary, DictionaryTerm}; /// A dictionary parsed from a plain-text file (one entry per line). -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct TxtDictionary { name: String, - entries: Vec, + terms: Vec, } impl TxtDictionary { @@ -17,14 +19,34 @@ impl TxtDictionary { pub fn new(name: impl Into, text: &str) -> Self { let name = name.into(); - let entries = text + let terms = text .lines() .map(|l| l.trim()) .filter(|l| !l.is_empty()) - .map(String::from) + .map(|l| DictionaryTerm { + value: l.to_owned(), + column: None, + }) .collect(); - Self { name, entries } + Self { name, terms } + } + + /// Load a plain-text dictionary from a file path. + /// + /// The dictionary name is derived from the file stem. + /// + /// # Errors + /// + /// Returns [`std::io::Error`] if the file cannot be read. + pub fn from_path(path: impl AsRef) -> std::io::Result { + let path = path.as_ref(); + let name = path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or_default(); + let text = std::fs::read_to_string(path)?; + Ok(Self::new(name, &text)) } } @@ -33,8 +55,8 @@ impl Dictionary for TxtDictionary { &self.name } - fn entries(&self) -> &[String] { - &self.entries + fn terms(&self) -> &[DictionaryTerm] { + &self.terms } } @@ -46,6 +68,10 @@ mod tests { fn parses_lines() { let dict = TxtDictionary::new("test", "alpha\n beta \n\ngamma\n"); assert_eq!(dict.name(), "test"); - assert_eq!(dict.entries(), &["alpha", "beta", "gamma"]); + + let values: Vec<&str> = dict.terms().iter().map(|t| t.value.as_str()).collect(); + assert_eq!(values, &["alpha", "beta", "gamma"]); + + assert!(dict.terms().iter().all(|t| t.column.is_none())); } } diff --git a/crates/nvisy-pattern/src/engine/allow_list.rs b/crates/nvisy-pattern/src/engine/allow_list.rs index 24ee8b9b..f08b6cc2 100644 --- a/crates/nvisy-pattern/src/engine/allow_list.rs +++ b/crates/nvisy-pattern/src/engine/allow_list.rs @@ 
-1,4 +1,4 @@ -//! [`AllowList`] — exact-match suppression of known false positives. +//! [`AllowList`]: exact-match suppression of known false positives. use std::collections::HashSet; diff --git a/crates/nvisy-pattern/src/engine/builder.rs b/crates/nvisy-pattern/src/engine/builder.rs index 3cfb4f8d..acb56a34 100644 --- a/crates/nvisy-pattern/src/engine/builder.rs +++ b/crates/nvisy-pattern/src/engine/builder.rs @@ -1,25 +1,21 @@ -//! [`PatternEngineBuilder`] — configures and compiles a [`PatternEngine`]. +//! [`PatternEngineBuilder`]: configures and compiles a [`PatternEngine`]. use regex::{Regex, RegexSet}; -use super::allow_list::AllowList; -use super::deny_list::DenyList; use super::error::PatternEngineError; -use super::{DictEntry, PatternEngine, RegexEntry}; +use super::{DictEntry, PatternEngine, RegexEntry, TARGET}; use crate::dictionaries; -use crate::patterns::{self, MatchSource, Pattern}; +use crate::patterns::{MatchSource, Pattern}; use crate::validators::ValidatorResolver; /// Builder for [`PatternEngine`]. /// /// By default all built-in patterns are included. Use -/// [`patterns`](Self::patterns) to restrict to a subset. +/// [`with_patterns`](Self::with_patterns) to restrict to a subset. #[derive(Default)] pub struct PatternEngineBuilder { pattern_names: Option>, confidence_threshold: f64, - allow_list: AllowList, - deny_list: DenyList, } impl PatternEngineBuilder { @@ -37,46 +33,27 @@ impl PatternEngineBuilder { /// Set the minimum confidence score for matches. /// /// Matches with confidence below this value are discarded during - /// [`scan_text`](PatternEngine::scan_text). Defaults to `0.0`. + /// [`scan_text`](PatternEngine::scan_text). Defaults to `0.0`. pub fn with_confidence_threshold(mut self, threshold: f64) -> Self { self.confidence_threshold = threshold; self } - /// Set the allow list. - /// - /// Matches whose exact value appears in the allow list are suppressed - /// (dropped) during [`scan_text`](PatternEngine::scan_text). 
- pub fn with_allow(mut self, list: AllowList) -> Self { - self.allow_list = list; - self - } - - /// Set the deny list. - /// - /// If a deny-list value is found in the scanned text but was not matched - /// by any regex or dictionary pattern, it is injected as a synthetic match - /// with confidence `1.0`. - pub fn with_deny(mut self, list: DenyList) -> Self { - self.deny_list = list; - self - } - /// Compile all selected patterns and build the engine. /// /// # Errors /// - /// Returns [`PatternEngineError`] if a regex fails to compile, a + /// Returns [`nvisy_core::Error`] if a regex fails to compile, a /// referenced dictionary is missing, or the Aho-Corasick automaton /// cannot be built. - #[tracing::instrument(name = "PatternEngine::build", skip(self))] - pub fn build(self) -> Result { - let pat_reg = patterns::builtin_registry(); + #[tracing::instrument(target = TARGET, name = "PatternEngine::build", skip(self))] + pub fn build(self) -> nvisy_core::Result { + let pat_reg = crate::patterns::builtin_registry(); let dict_reg = dictionaries::builtin_registry(); let active: Vec<&dyn Pattern> = match &self.pattern_names { Some(names) => names.iter().filter_map(|n| pat_reg.get(n)).collect(), - None => pat_reg.values(), + None => pat_reg.iter().collect(), }; let mut regex_entries = Vec::new(); @@ -86,15 +63,16 @@ impl PatternEngineBuilder { for p in &active { match p.match_source() { MatchSource::Regex(rp) => { + let effective = rp.effective_regex(); let compiled = - Regex::new(&rp.regex).map_err(|e| PatternEngineError::RegexCompile { + Regex::new(&effective).map_err(|e| PatternEngineError::RegexCompile { name: p.name().to_owned(), source: e, })?; - regex_strings.push(rp.regex.clone()); + regex_strings.push(effective); regex_entries.push(RegexEntry { pattern_name: p.name().to_owned(), - category: p.category().clone(), + category: p.category(), entity_kind: p.entity_kind(), confidence: rp.confidence, validator_name: rp.validator.clone(), @@ -109,11 +87,12 @@ impl 
PatternEngineBuilder { dictionary: dp.name.clone(), } })?; - let values: Vec = dict.entries().to_vec(); - if values.is_empty() { + let terms = dict.terms(); + if terms.is_empty() { continue; } - let columns = dict.columns().map(|c| c.to_vec()); + let values: Vec = terms.iter().map(|t| t.value.clone()).collect(); + let columns: Vec> = terms.iter().map(|t| t.column).collect(); let automaton = aho_corasick::AhoCorasickBuilder::new() .ascii_case_insensitive(!dp.case_sensitive) .build(&values) @@ -123,7 +102,7 @@ impl PatternEngineBuilder { })?; dict_entries.push(DictEntry { pattern_name: p.name().to_owned(), - category: p.category().clone(), + category: p.category(), entity_kind: p.entity_kind(), confidence: dp.confidence.clone(), automaton, @@ -140,6 +119,7 @@ impl PatternEngineBuilder { let validators = ValidatorResolver::builtins(); tracing::debug!( + target: TARGET, regex_count = regex_entries.len(), dict_count = dict_entries.len(), "PatternEngine built", @@ -151,8 +131,6 @@ impl PatternEngineBuilder { dict_entries, validators, confidence_threshold: self.confidence_threshold, - allow_set: self.allow_list, - deny_set: self.deny_list, }) } } diff --git a/crates/nvisy-pattern/src/engine/deny_list.rs b/crates/nvisy-pattern/src/engine/deny_list.rs index 946784c0..9fbb2f3d 100644 --- a/crates/nvisy-pattern/src/engine/deny_list.rs +++ b/crates/nvisy-pattern/src/engine/deny_list.rs @@ -1,37 +1,42 @@ -//! [`DenyList`] — forced detection of known sensitive values. +//! [`DenyList`]: forced detection of known sensitive values. -use std::collections::HashMap; +use std::collections::BTreeMap; -use nvisy_ontology::entity::{EntityCategory, EntityKind}; +use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod}; -/// A deny-list entry: a known sensitive value that must always be detected. +/// A deny-list rule: a known sensitive value that must always be detected. 
#[derive(Debug, Clone)] -pub struct DenyEntry { +pub struct DenyRule { /// Entity category for the injected match. pub category: EntityCategory, /// Entity kind for the injected match. pub entity_kind: EntityKind, + /// Recognition method carried from the original detection source. + pub method: RecognitionMethod, } /// Exact-match deny list for forcing detection of known sensitive values. /// /// If a deny-list value is found in the scanned text but was not already /// matched by any regex or dictionary pattern, it is injected as a synthetic -/// [`PatternMatch`](super::PatternMatch) with confidence `1.0` and source -/// [`DetectionSource::DenyList`](super::DetectionSource::DenyList). +/// [`RawMatch`](super::RawMatch) with confidence `1.0` and +/// `pattern_name: None`. /// /// # Examples /// /// ```rust,ignore -/// use nvisy_ontology::entity::{EntityCategory, EntityKind}; +/// use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod}; /// /// let deny = DenyList::new() -/// .with("John Doe", EntityCategory::Pii, EntityKind::PersonName) -/// .with("ACME Corp", EntityCategory::Pii, EntityKind::Organization); +/// .with("John Doe", DenyRule { +/// category: EntityCategory::PersonalIdentity, +/// entity_kind: EntityKind::PersonName, +/// method: RecognitionMethod::Ner, +/// }); /// ``` #[derive(Debug, Clone, Default)] pub struct DenyList { - pub(crate) entries: HashMap, + pub(crate) entries: BTreeMap, } impl DenyList { @@ -40,37 +45,15 @@ impl DenyList { Self::default() } - /// Add a single entry. - pub fn with( - mut self, - value: impl Into, - category: EntityCategory, - entity_kind: EntityKind, - ) -> Self { - self.entries.insert( - value.into(), - DenyEntry { - category, - entity_kind, - }, - ); + /// Add a single rule (builder style). + pub fn with(mut self, value: impl Into, rule: DenyRule) -> Self { + self.entries.insert(value.into(), rule); self } - /// Insert an entry into this list. 
- pub fn insert( - &mut self, - value: impl Into, - category: EntityCategory, - entity_kind: EntityKind, - ) { - self.entries.insert( - value.into(), - DenyEntry { - category, - entity_kind, - }, - ); + /// Insert a rule into this list. + pub fn insert(&mut self, value: impl Into, rule: DenyRule) { + self.entries.insert(value.into(), rule); } /// Whether the list contains the given value. @@ -79,9 +62,9 @@ impl DenyList { self.entries.contains_key(value) } - /// Look up the entry for a value. + /// Look up the rule for a value. #[must_use] - pub fn get(&self, value: &str) -> Option<&DenyEntry> { + pub fn get(&self, value: &str) -> Option<&DenyRule> { self.entries.get(value) } @@ -97,18 +80,8 @@ impl DenyList { self.entries.is_empty() } - /// Iterate over (value, entry) pairs. - pub fn iter(&self) -> impl Iterator { + /// Iterate over (value, rule) pairs. + pub fn iter(&self) -> impl Iterator { self.entries.iter().map(|(k, v)| (k.as_str(), v)) } } - -impl> FromIterator<(S, EntityCategory, EntityKind)> for DenyList { - fn from_iter>(iter: I) -> Self { - let mut list = Self::new(); - for (value, category, entity_kind) in iter { - list.insert(value, category, entity_kind); - } - list - } -} diff --git a/crates/nvisy-pattern/src/engine/error.rs b/crates/nvisy-pattern/src/engine/error.rs index 4de9a389..337a18b0 100644 --- a/crates/nvisy-pattern/src/engine/error.rs +++ b/crates/nvisy-pattern/src/engine/error.rs @@ -1,8 +1,12 @@ -//! Errors produced during [`PatternEngine`](super::PatternEngine) construction. +//! Errors produced during [`PatternEngine`] construction. +//! +//! [`PatternEngine`]: super::PatternEngine + +use nvisy_core::{Error, ErrorKind}; /// Errors that can occur while building a [`PatternEngine`](super::PatternEngine). #[derive(Debug, thiserror::Error)] -pub enum PatternEngineError { +pub(crate) enum PatternEngineError { /// A regex pattern string failed to compile. 
#[error("failed to compile regex for pattern '{name}': {source}")] RegexCompile { name: String, source: regex::Error }, @@ -19,3 +23,11 @@ pub enum PatternEngineError { #[error("failed to build RegexSet pre-filter: {0}")] RegexSetBuild(regex::Error), } + +impl From for Error { + fn from(err: PatternEngineError) -> Self { + Error::new(ErrorKind::Validation, err.to_string()) + .with_component("nvisy-pattern::engine") + .with_source(err) + } +} diff --git a/crates/nvisy-pattern/src/engine/mod.rs b/crates/nvisy-pattern/src/engine/mod.rs index fcf70770..0883ca07 100644 --- a/crates/nvisy-pattern/src/engine/mod.rs +++ b/crates/nvisy-pattern/src/engine/mod.rs @@ -2,39 +2,41 @@ //! //! [`PatternEngine`] compiles all built-in (and optionally user-selected) //! regex patterns and dictionary automata into a single unit that can -//! scan text in one call. Use [`PatternEngineBuilder`] for configuration -//! or [`default_engine`] for an out-of-the-box singleton. +//! scan text in one call. Use [`PatternEngine::builder`] for configuration +//! or [`PatternEngine::instance`] for an out-of-the-box singleton. //! //! # Key types //! -//! - [`PatternEngine`]: the pre-compiled scanning engine. -//! - [`PatternEngineBuilder`]: builder for configuring patterns, thresholds, -//! and allow/deny lists. -//! - [`PatternMatch`]: a single match produced by scanning. -//! - [`DetectionSource`]: how a match was produced (regex, dictionary, deny list). +//! - [`PatternEngine`]: pre-compiled scanning engine. +//! - [`ScanContext`]: per-scan allow/deny list configuration. +//! - [`RawMatch`]: single match produced by scanning. //! - [`AllowList`] / [`DenyList`]: exact-match suppression and forced detection. -//! - [`PatternEngineError`]: build-time errors. +//! - [`PatternEngineBuilder`]: builder for configuring patterns and thresholds. 
mod allow_list; mod builder; mod deny_list; mod error; mod pattern_match; +mod scan_context; +use std::collections::HashSet; use std::sync::LazyLock; use aho_corasick::AhoCorasick; -pub use allow_list::AllowList; -pub use builder::PatternEngineBuilder; -pub use deny_list::{DenyEntry, DenyList}; -pub use error::PatternEngineError; -use nvisy_ontology::entity::{EntityCategory, EntityKind}; -pub use pattern_match::{DetectionSource, PatternMatch}; +use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod}; use regex::{Regex, RegexSet}; +pub use self::allow_list::AllowList; +pub use self::builder::PatternEngineBuilder; +pub use self::deny_list::{DenyList, DenyRule}; +pub use self::pattern_match::RawMatch; +pub use self::scan_context::ScanContext; use crate::patterns::{ContextRule, DictionaryConfidence}; use crate::validators::ValidatorResolver; +const TARGET: &str = "nvisy_pattern::engine"; + /// Metadata stored alongside each compiled regex. struct RegexEntry { pattern_name: String, @@ -56,8 +58,8 @@ struct DictEntry { /// The terms used to build the automaton, indexed by pattern id. values: Vec, /// Per-entry column index from the source dictionary (parallel to `values`). - /// `None` for plain-text dictionaries (all entries are column 0). - columns: Option>, + /// `None` entries indicate plain-text origin (logically column 0). + columns: Vec>, context: Option, } @@ -66,9 +68,10 @@ impl DictEntry { fn resolve_confidence(&self, pattern_index: usize) -> f64 { let col = self .columns - .as_ref() - .and_then(|cols| cols.get(pattern_index).copied()) - .unwrap_or(0); + .get(pattern_index) + .copied() + .flatten() + .unwrap_or(0) as usize; self.confidence.resolve(col) } } @@ -77,25 +80,23 @@ impl DictEntry { /// /// Scanning runs in three phases: /// -/// 1. **Regex** — a [`RegexSet`] pre-filter selects candidate patterns, +/// 1. **Regex**: a [`RegexSet`] pre-filter selects candidate patterns, /// then each matching regex extracts offsets and values. 
-/// 2. **Dictionary** — Aho-Corasick automata perform literal multi-pattern +/// 2. **Dictionary**: Aho-Corasick automata perform literal multi-pattern /// matching against known-value dictionaries. -/// 3. **Deny list** — known sensitive values not already matched are +/// 3. **Deny list**: known sensitive values not already matched are /// injected as synthetic matches with confidence `1.0`. /// /// Allow-list filtering is applied inline during phases 1 and 2. /// -/// Build via [`PatternEngine::builder`] or use [`default_engine`] for -/// the singleton with all built-in patterns. +/// Build via [`PatternEngine::builder`] or use [`PatternEngine::instance`] +/// for the singleton with all built-in patterns. pub struct PatternEngine { regex_set: RegexSet, regex_entries: Vec, dict_entries: Vec, validators: ValidatorResolver, confidence_threshold: f64, - allow_set: AllowList, - deny_set: DenyList, } impl std::fmt::Debug for PatternEngine { @@ -109,46 +110,37 @@ impl std::fmt::Debug for PatternEngine { } impl PatternEngine { + /// Return a reference to the lazily-initialised default engine + /// containing all built-in patterns. + pub fn instance() -> &'static Self { + &DEFAULT_ENGINE + } + /// Create a new [`PatternEngineBuilder`]. pub fn builder() -> PatternEngineBuilder { PatternEngineBuilder::default() } - /// Validate a value using the checksum associated with the entity kind. - /// - /// Returns `Some(true)` if the value passes, `Some(false)` if it fails, - /// or `None` if no checksum validator is registered for that entity kind. - pub fn validate_checksum(&self, entity_kind: EntityKind, value: &str) -> Option { - let validator_name = match entity_kind { - EntityKind::PaymentCard => "luhn", - EntityKind::GovernmentId => "ssn", - EntityKind::Iban => "iban", - _ => return None, - }; - let validate = self.validators.resolve(validator_name)?; - Some(validate(value)) - } - /// Scan `text` and return all matches above the confidence threshold. 
/// /// Matches whose value appears in the allow list are suppressed. /// Deny-list values found in the text are injected as synthetic matches /// with confidence `1.0` when not already matched. - #[tracing::instrument(skip(self, text), fields(text_len = text.len(), matches))] - pub fn scan_text(&self, text: &str) -> Vec { + #[tracing::instrument(target = TARGET, skip(self, text, ctx), fields(text_len = text.len(), matches = tracing::field::Empty))] + pub fn scan_text(&self, text: &str, ctx: &ScanContext) -> Vec { let mut results = Vec::new(); - self.scan_regex(text, &mut results); - self.scan_dict(text, &mut results); - self.scan_deny_list(text, &mut results); + self.scan_regex(text, &ctx.allow, &mut results); + self.scan_dict(text, &ctx.allow, &mut results); + self.scan_deny_list(text, &ctx.deny, &mut results); tracing::Span::current().record("matches", results.len()); results } - /// Phase 1: regex matches — use `RegexSet` as a pre-filter, then run + /// Phase 1: regex matches. Uses `RegexSet` as a pre-filter, then runs /// each matching regex individually to extract offsets and values. 
- fn scan_regex(&self, text: &str, results: &mut Vec) { + fn scan_regex(&self, text: &str, allow: &AllowList, results: &mut Vec) { let set_matches = self.regex_set.matches(text); for idx in set_matches.iter() { let entry = &self.regex_entries[idx]; @@ -160,26 +152,30 @@ impl PatternEngine { for mat in entry.regex.find_iter(text) { let value = mat.as_str(); - if self.allow_set.contains(value) { + if allow.contains(value) { continue; } + let mut methods = vec![RecognitionMethod::Regex]; + if let Some(ref vname) = entry.validator_name && let Some(validate) = self.validators.resolve(vname) - && !validate(value) { - continue; + if !validate(value) { + continue; + } + methods.push(RecognitionMethod::Checksum); } - results.push(PatternMatch { - pattern_name: entry.pattern_name.clone(), - category: entry.category.clone(), + results.push(RawMatch { + pattern_name: Some(entry.pattern_name.clone()), + category: entry.category, entity_kind: entry.entity_kind, value: value.to_owned(), start: mat.start(), end: mat.end(), confidence: entry.confidence, - source: DetectionSource::Regex, + recognition_methods: methods, context: entry.context.clone(), }); } @@ -187,64 +183,66 @@ impl PatternEngine { } /// Phase 2: dictionary matches via Aho-Corasick automata. - fn scan_dict(&self, text: &str, results: &mut Vec) { + fn scan_dict(&self, text: &str, allow: &AllowList, results: &mut Vec) { for entry in &self.dict_entries { for mat in entry.automaton.find_iter(text) { let pat_idx = mat.pattern().as_usize(); let value = &entry.values[pat_idx]; - // Resolve per-entry confidence: use column override if available, - // otherwise fall back to the pattern's base confidence. 
let confidence = entry.resolve_confidence(pat_idx); if confidence < self.confidence_threshold { continue; } - if self.allow_set.contains(value.as_str()) { + if allow.contains(value.as_str()) { continue; } - results.push(PatternMatch { - pattern_name: entry.pattern_name.clone(), - category: entry.category.clone(), + results.push(RawMatch { + pattern_name: Some(entry.pattern_name.clone()), + category: entry.category, entity_kind: entry.entity_kind, value: value.clone(), start: mat.start(), end: mat.end(), confidence, - source: DetectionSource::Dictionary, + recognition_methods: vec![RecognitionMethod::Dictionary], context: entry.context.clone(), }); } } } - /// Phase 3: inject deny-list values found in `text` that were not - /// already matched by regex or dictionary. - fn scan_deny_list(&self, text: &str, results: &mut Vec) { - for (deny_value, deny_entry) in self.deny_set.iter() { - if results.iter().any(|r| r.value == deny_value) { + /// Phase 3: inject deny-list values found in `text` not already + /// matched by regex or dictionary. 
+ fn scan_deny_list(&self, text: &str, deny: &DenyList, results: &mut Vec) { + let matched_values: HashSet<&str> = results.iter().map(|r| r.value.as_str()).collect(); + + let mut deny_matches = Vec::new(); + for (deny_value, deny_rule) in deny.iter() { + if matched_values.contains(deny_value) { continue; } let mut search_start = 0; while let Some(pos) = text[search_start..].find(deny_value) { let abs_start = search_start + pos; let abs_end = abs_start + deny_value.len(); - results.push(PatternMatch { - pattern_name: String::new(), - category: deny_entry.category.clone(), - entity_kind: deny_entry.entity_kind, + deny_matches.push(RawMatch { + pattern_name: None, + category: deny_rule.category, + entity_kind: deny_rule.entity_kind, value: deny_value.to_owned(), start: abs_start, end: abs_end, confidence: 1.0, - source: DetectionSource::DenyList, + recognition_methods: vec![deny_rule.method], context: None, }); search_start = abs_end; } } + results.extend(deny_matches); } } @@ -254,28 +252,28 @@ static DEFAULT_ENGINE: LazyLock = LazyLock::new(|| { .expect("built-in patterns must compile") }); -/// Return a reference to the lazily-initialised default [`PatternEngine`] -/// containing all built-in patterns. 
-pub fn default_engine() -> &'static PatternEngine { - &DEFAULT_ENGINE -} - #[cfg(test)] mod tests { use super::*; + fn empty_ctx() -> ScanContext { + ScanContext::default() + } + #[test] fn default_engine_builds() { - let engine = default_engine(); + let engine = PatternEngine::instance(); assert!(!engine.regex_entries.is_empty()); } #[test] fn scan_text_finds_ssn() { - let engine = default_engine(); - let matches = engine.scan_text("My SSN is 123-45-6789."); + let engine = PatternEngine::instance(); + let matches = engine.scan_text("My SSN is 123-45-6789.", &empty_ctx()); assert!( - matches.iter().any(|m| m.pattern_name == "ssn"), + matches + .iter() + .any(|m| m.pattern_name.as_deref() == Some("ssn")), "expected SSN match, got: {:?}", matches.iter().map(|m| &m.pattern_name).collect::>() ); @@ -283,10 +281,12 @@ mod tests { #[test] fn scan_text_finds_email() { - let engine = default_engine(); - let matches = engine.scan_text("Contact: alice@example.com"); + let engine = PatternEngine::instance(); + let matches = engine.scan_text("Contact: alice@example.com", &empty_ctx()); assert!( - matches.iter().any(|m| m.pattern_name == "email"), + matches + .iter() + .any(|m| m.pattern_name.as_deref() == Some("email")), "expected email match, got: {:?}", matches.iter().map(|m| &m.pattern_name).collect::>() ); @@ -298,9 +298,11 @@ mod tests { .with_confidence_threshold(0.99) .build() .unwrap(); - let matches = engine.scan_text("My SSN is 123-45-6789."); + let matches = engine.scan_text("My SSN is 123-45-6789.", &empty_ctx()); assert!( - !matches.iter().any(|m| m.pattern_name == "ssn"), + !matches + .iter() + .any(|m| m.pattern_name.as_deref() == Some("ssn")), "SSN should be filtered by 0.99 threshold" ); } @@ -317,25 +319,28 @@ mod tests { #[test] fn scan_text_returns_correct_offsets() { - let engine = default_engine(); + let engine = PatternEngine::instance(); let text = "SSN: 123-45-6789"; - let matches = engine.scan_text(text); - let ssn_match = matches.iter().find(|m| 
m.pattern_name == "ssn").unwrap(); + let matches = engine.scan_text(text, &empty_ctx()); + let ssn_match = matches + .iter() + .find(|m| m.pattern_name.as_deref() == Some("ssn")) + .unwrap(); assert_eq!(&text[ssn_match.start..ssn_match.end], "123-45-6789"); } #[test] fn dictionary_matches_are_found() { - let engine = default_engine(); - let matches = engine.scan_text("She is American and speaks English."); + let engine = PatternEngine::instance(); + let matches = engine.scan_text("She is American and speaks English.", &empty_ctx()); assert!( - matches - .iter() - .any(|m| m.source == DetectionSource::Dictionary), + matches.iter().any(|m| m + .recognition_methods + .contains(&RecognitionMethod::Dictionary)), "expected dictionary match, got: {:?}", matches .iter() - .map(|m| (&m.pattern_name, &m.source)) + .map(|m| (&m.pattern_name, &m.recognition_methods)) .collect::>() ); } @@ -344,12 +349,14 @@ mod tests { fn allow_list_suppresses_match() { let engine = PatternEngine::builder() .with_patterns(&["ssn"]) - .with_allow(AllowList::new().with("123-45-6789")) .build() .unwrap(); - let matches = engine.scan_text("SSN: 123-45-6789"); + let ctx = ScanContext::new().with_allow(AllowList::new().with("123-45-6789")); + let matches = engine.scan_text("SSN: 123-45-6789", &ctx); assert!( - !matches.iter().any(|m| m.pattern_name == "ssn"), + !matches + .iter() + .any(|m| m.pattern_name.as_deref() == Some("ssn")), "allow-listed value should be suppressed" ); } @@ -358,37 +365,46 @@ mod tests { fn deny_list_injects_match() { let deny = DenyList::new().with( "secret-value-42", - EntityCategory::Pii, - EntityKind::PersonName, + DenyRule { + category: EntityCategory::PersonalIdentity, + entity_kind: EntityKind::PersonName, + method: RecognitionMethod::Ner, + }, ); let engine = PatternEngine::builder() .with_patterns(&["email"]) - .with_deny(deny) .build() .unwrap(); - let matches = engine.scan_text("The secret-value-42 should be detected."); + let ctx = 
ScanContext::new().with_deny(deny); + let matches = engine.scan_text("The secret-value-42 should be detected.", &ctx); let deny_match = matches .iter() - .find(|m| m.source == DetectionSource::DenyList) + .find(|m| m.pattern_name.is_none()) .expect("deny list value should be injected"); assert_eq!(deny_match.value, "secret-value-42"); assert_eq!(deny_match.confidence, 1.0); assert_eq!(deny_match.entity_kind, EntityKind::PersonName); + assert_eq!(deny_match.recognition_methods, vec![RecognitionMethod::Ner]); } #[test] fn deny_list_not_injected_when_absent() { - let deny = DenyList::new().with("not-in-text", EntityCategory::Pii, EntityKind::PersonName); + let deny = DenyList::new().with( + "not-in-text", + DenyRule { + category: EntityCategory::PersonalIdentity, + entity_kind: EntityKind::PersonName, + method: RecognitionMethod::Manual, + }, + ); let engine = PatternEngine::builder() .with_patterns(&["email"]) - .with_deny(deny) .build() .unwrap(); - let matches = engine.scan_text("Nothing special here."); + let ctx = ScanContext::new().with_deny(deny); + let matches = engine.scan_text("Nothing special here.", &ctx); assert!( - !matches - .iter() - .any(|m| m.source == DetectionSource::DenyList), + !matches.iter().any(|m| m.pattern_name.is_none()), "deny list value not in text should not be injected" ); } @@ -403,24 +419,36 @@ mod tests { } #[test] - fn deny_list_from_iterator() { - let deny: DenyList = [ - ("secret", EntityCategory::Pii, EntityKind::PersonName), - ("other", EntityCategory::Financial, EntityKind::PaymentCard), - ] - .into_iter() - .collect(); + fn deny_list_insert_and_lookup() { + let mut deny = DenyList::new(); + deny.insert( + "secret", + DenyRule { + category: EntityCategory::PersonalIdentity, + entity_kind: EntityKind::PersonName, + method: RecognitionMethod::Ner, + }, + ); + deny.insert( + "other", + DenyRule { + category: EntityCategory::Financial, + entity_kind: EntityKind::PaymentCard, + method: RecognitionMethod::Manual, + }, + ); 
assert_eq!(deny.len(), 2); assert!(deny.contains("secret")); - let entry = deny.get("other").unwrap(); - assert_eq!(entry.category, EntityCategory::Financial); + let rule = deny.get("other").unwrap(); + assert_eq!(rule.category, EntityCategory::Financial); + assert_eq!(rule.method, RecognitionMethod::Manual); } #[test] fn column_confidence_applies_to_csv_dictionaries() { - let engine = default_engine(); + let engine = PatternEngine::instance(); // "US Dollar" is column 0 (full name), "USD" is column 1 (code). - let matches = engine.scan_text("I paid in US Dollar and also in USD."); + let matches = engine.scan_text("I paid in US Dollar and also in USD.", &empty_ctx()); let full_name = matches.iter().find(|m| m.value == "US Dollar"); let code = matches.iter().find(|m| m.value == "USD"); assert!(full_name.is_some(), "should match 'US Dollar'"); @@ -439,15 +467,42 @@ mod tests { .with_patterns(&["ssn"]) .build() .unwrap(); - let matches = engine.scan_text("SSN: 123-45-6789"); - let ssn_match = matches.iter().find(|m| m.pattern_name == "ssn").unwrap(); + let matches = engine.scan_text("SSN: 123-45-6789", &empty_ctx()); + let ssn_match = matches + .iter() + .find(|m| m.pattern_name.as_deref() == Some("ssn")) + .unwrap(); assert!( ssn_match.context.is_some(), - "SSN pattern should carry context rule through to PatternMatch" + "SSN pattern should carry context rule through to RawMatch" ); let ctx = ssn_match.context.as_ref().unwrap(); assert!(!ctx.keywords.is_empty()); assert!(ctx.window > 0); assert!(ctx.boost > 0.0); } + + #[test] + fn into_entity_builds_entity_without_location() { + let raw = RawMatch { + pattern_name: Some("ssn".into()), + category: EntityCategory::PersonalIdentity, + entity_kind: EntityKind::GovernmentId, + value: "123-45-6789".into(), + start: 5, + end: 16, + confidence: 0.9, + recognition_methods: vec![RecognitionMethod::Regex, RecognitionMethod::Checksum], + context: None, + }; + let entity = raw.into_entity(); + assert_eq!(entity.value, 
"123-45-6789"); + assert_eq!(entity.entity_kind, EntityKind::GovernmentId); + assert_eq!( + entity.recognition_methods, + vec![RecognitionMethod::Regex, RecognitionMethod::Checksum] + ); + assert!((entity.confidence - 0.9).abs() < f64::EPSILON); + assert!(entity.location.is_none()); + } } diff --git a/crates/nvisy-pattern/src/engine/pattern_match.rs b/crates/nvisy-pattern/src/engine/pattern_match.rs index a968d613..8af6afb5 100644 --- a/crates/nvisy-pattern/src/engine/pattern_match.rs +++ b/crates/nvisy-pattern/src/engine/pattern_match.rs @@ -1,25 +1,15 @@ -//! [`PatternMatch`] and [`DetectionSource`] — output types from pattern scanning. +//! [`RawMatch`]: output type from pattern scanning. -use nvisy_ontology::entity::{EntityCategory, EntityKind}; +use nvisy_ontology::entity::{Entity, EntityCategory, EntityKind, RecognitionMethod}; use crate::patterns::ContextRule; -/// How the match was produced. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum DetectionSource { - /// Matched by a compiled regular expression. - Regex, - /// Matched by Aho-Corasick dictionary lookup. - Dictionary, - /// Injected by the deny list (known sensitive value). - DenyList, -} - /// A single match produced by [`PatternEngine::scan_text`](super::PatternEngine::scan_text). #[derive(Debug, Clone)] -pub struct PatternMatch { - /// Name of the pattern that produced this match. - pub pattern_name: String, +pub struct RawMatch { + /// Name of the pattern that produced this match, or `None` for + /// deny-list–injected matches. + pub pattern_name: Option, /// Entity category of the match. pub category: EntityCategory, /// Entity kind of the match. @@ -32,8 +22,38 @@ pub struct PatternMatch { pub end: usize, /// Confidence score assigned by the pattern definition. pub confidence: f64, - /// How this match was produced (regex, dictionary, or deny list). - pub source: DetectionSource, + /// Recognition methods that produced this match, ordered by + /// application time (e.g. 
`[Regex, Checksum]` when a regex + /// match was confirmed by a validator). + pub recognition_methods: Vec, /// Optional context rule for span-level co-occurrence scoring. pub context: Option, } + +impl RawMatch { + /// Build an [`Entity`] from this match. + /// + /// The returned entity has no location or parent set: the caller + /// should attach those from the span context via + /// [`Entity::with_location`] and [`Entity::with_parent`]. + /// + /// # Panics + /// + /// Panics if `recognition_methods` is empty. All engine-produced + /// matches always carry at least one method. + pub fn into_entity(self) -> Entity { + debug_assert!( + !self.recognition_methods.is_empty(), + "RawMatch::into_entity requires at least one recognition method" + ); + let mut entity = Entity::new( + self.category, + self.entity_kind, + self.value, + self.recognition_methods[0], + self.confidence, + ); + entity.recognition_methods = self.recognition_methods; + entity + } +} diff --git a/crates/nvisy-pattern/src/engine/scan_context.rs b/crates/nvisy-pattern/src/engine/scan_context.rs new file mode 100644 index 00000000..19e81769 --- /dev/null +++ b/crates/nvisy-pattern/src/engine/scan_context.rs @@ -0,0 +1,50 @@ +//! [`ScanContext`]: per-scan allow/deny list configuration. + +use super::allow_list::AllowList; +use super::deny_list::DenyList; + +/// Per-scan configuration for allow and deny lists. +/// +/// Passed to [`PatternEngine::scan_text`](super::PatternEngine::scan_text) +/// to control per-invocation suppression and forced detection without +/// rebuilding the engine. 
+/// +/// # Examples +/// +/// ```rust,ignore +/// use nvisy_pattern::prelude::*; +/// use nvisy_ontology::entity::{EntityCategory, EntityKind, RecognitionMethod}; +/// +/// let ctx = ScanContext::new() +/// .with_allow(AllowList::new().with("000-00-0000")) +/// .with_deny(DenyList::new().with("secret", DenyRule { +/// category: EntityCategory::PersonalIdentity, +/// entity_kind: EntityKind::PersonName, +/// method: RecognitionMethod::Manual, +/// })); +/// let matches = PatternEngine::instance().scan_text("text", &ctx); +/// ``` +#[derive(Debug, Clone, Default)] +pub struct ScanContext { + pub(super) allow: AllowList, + pub(super) deny: DenyList, +} + +impl ScanContext { + /// Create an empty scan context (no allow/deny filtering). + pub fn new() -> Self { + Self::default() + } + + /// Set the allow list. + pub fn with_allow(mut self, list: AllowList) -> Self { + self.allow = list; + self + } + + /// Set the deny list. + pub fn with_deny(mut self, list: DenyList) -> Self { + self.deny = list; + self + } +} diff --git a/crates/nvisy-pattern/src/lib.rs b/crates/nvisy-pattern/src/lib.rs index 9f505274..f3555804 100644 --- a/crates/nvisy-pattern/src/lib.rs +++ b/crates/nvisy-pattern/src/lib.rs @@ -2,13 +2,14 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -pub(crate) mod dictionaries; -pub mod engine; -pub(crate) mod patterns; +pub mod dictionaries; +pub(crate) mod engine; +pub mod patterns; pub(crate) mod validators; -pub use engine::{DetectionSource, PatternEngine, PatternEngineBuilder, PatternMatch}; -pub use patterns::ContextRule; +pub use self::engine::{ + AllowList, DenyList, DenyRule, PatternEngine, PatternEngineBuilder, RawMatch, ScanContext, +}; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-pattern/src/patterns/context_rule.rs b/crates/nvisy-pattern/src/patterns/context_rule.rs index ef0feb0d..1e596bdc 100644 --- a/crates/nvisy-pattern/src/patterns/context_rule.rs +++ 
b/crates/nvisy-pattern/src/patterns/context_rule.rs @@ -1,29 +1,59 @@ -//! [`ContextRule`] — co-occurrence context for span-level confidence boosting. +//! [`ContextRule`]: co-occurrence context for span-level confidence boosting. -use serde::{Deserialize, Serialize}; +use serde::Deserialize; /// Co-occurrence context rule for span-level confidence boosting. /// /// When a pattern match is found, nearby spans are searched for any of the -/// `keywords`. If at least one keyword is present within `window` spans, +/// `keywords`. If at least one keyword is present within `window` spans, /// the match confidence is increased by `boost` (clamped to `[0.0, 1.0]`). -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Deserialize)] +#[serde(try_from = "RawContextRule")] pub struct ContextRule { /// Keywords to look for in nearby spans. pub keywords: Vec, /// Number of spans before and after the match span to search. - #[serde(default = "default_window")] pub window: usize, /// Confidence adjustment when at least one keyword is found. - #[serde(default = "default_boost")] + /// Must be in the range `[0.0, 1.0]`. pub boost: f64, /// Whether keyword matching is case-sensitive. /// - /// Defaults to `false` (case-insensitive). - #[serde(default)] + /// Defaults to `false`: case-insensitive. pub case_sensitive: bool, } +/// Serde intermediary that mirrors the JSON shape before validation. 
+#[derive(Debug, Clone, Deserialize)] +struct RawContextRule { + keywords: Vec, + #[serde(default = "default_window")] + window: usize, + #[serde(default = "default_boost")] + boost: f64, + #[serde(default)] + case_sensitive: bool, +} + +impl TryFrom for ContextRule { + type Error = String; + + fn try_from(raw: RawContextRule) -> Result { + if !(0.0..=1.0).contains(&raw.boost) { + return Err(format!( + "context rule boost must be in [0.0, 1.0], got {}", + raw.boost + )); + } + Ok(Self { + keywords: raw.keywords, + window: raw.window, + boost: raw.boost, + case_sensitive: raw.case_sensitive, + }) + } +} + fn default_window() -> usize { 3 } diff --git a/crates/nvisy-pattern/src/patterns/json_pattern.rs b/crates/nvisy-pattern/src/patterns/json_pattern.rs index 750e2ac3..cc7634d3 100644 --- a/crates/nvisy-pattern/src/patterns/json_pattern.rs +++ b/crates/nvisy-pattern/src/patterns/json_pattern.rs @@ -1,15 +1,16 @@ -//! JSON-backed `JsonPattern` implementation. +//! JSON-backed [`JsonPattern`] implementation. //! //! Each JSON file under `assets/patterns/` is deserialized into a -//! `JsonPattern` via `from_bytes`. The method returns the validated -//! pattern together with any non-fatal `JsonPatternWarning`s so the -//! caller can decide how to surface them. +//! [`JsonPattern`] via [`from_bytes`](JsonPattern::from_bytes). The method +//! returns the validated pattern together with any non-fatal +//! [`JsonPatternWarning`]s so the caller can decide how to surface them. use nvisy_ontology::entity::{EntityCategory, EntityKind}; use serde::Deserialize; use super::context_rule::ContextRule; use super::pattern::{DictionaryPattern, MatchSource, Pattern, RegexPattern}; +use crate::validators::ValidatorResolver; /// Error returned when a JSON pattern file cannot be loaded. #[derive(Debug, thiserror::Error)] @@ -25,10 +26,6 @@ pub enum JsonPatternError { /// indicate misconfiguration (e.g. a typo in the validator name). 
#[derive(Debug)] pub enum JsonPatternWarning { - /// The `"category"` value was not a recognised variant and fell through - /// to [`EntityCategory::Custom`]. - UnknownCategory { pattern: String, slug: String }, - /// The `"validator"` name does not match any built-in validator, so /// the pattern will have no post-match validation. UnknownValidator { pattern: String, validator: String }, @@ -37,7 +34,7 @@ pub enum JsonPatternWarning { /// A detection pattern deserialized from a JSON definition file. /// /// Implements the [`Pattern`] trait and is the only concrete implementation -/// shipped with this crate. Construct via `from_bytes`. +/// shipped with this crate. Construct via [`from_bytes`](Self::from_bytes). #[derive(Debug, Clone)] pub struct JsonPattern { name: String, @@ -50,6 +47,10 @@ pub struct JsonPattern { impl JsonPattern { /// Deserialize and validate a pattern from raw JSON bytes. /// + /// `validators` is used to check whether a referenced validator name + /// is registered: unrecognised names produce a [`JsonPatternWarning`] + /// but do not prevent loading. + /// /// On success returns the pattern together with a (possibly empty) /// list of [`JsonPatternWarning`]s. /// @@ -60,8 +61,9 @@ impl JsonPattern { /// and `dictionary`). pub(crate) fn from_bytes( bytes: &[u8], + validators: &ValidatorResolver, ) -> Result<(Self, Vec), JsonPatternError> { - /// Serde helper: exactly one of `pattern` or `dictionary`. + /// Serde helper: exactly one of `pattern` or `dictionary` must be present. #[derive(Deserialize)] #[serde(untagged)] enum RawSource { @@ -91,19 +93,11 @@ impl JsonPattern { let mut warnings = Vec::new(); - if let EntityCategory::Custom(ref slug) = raw.category { - warnings.push(JsonPatternWarning::UnknownCategory { - pattern: raw.name.clone(), - slug: slug.clone(), - }); - } if let MatchSource::Regex(RegexPattern { validator: Some(ref v), .. 
}) = match_source - && crate::validators::ValidatorResolver::builtins() - .resolve(v) - .is_none() + && validators.resolve(v).is_none() { warnings.push(JsonPatternWarning::UnknownValidator { pattern: raw.name.clone(), @@ -128,8 +122,8 @@ impl Pattern for JsonPattern { &self.name } - fn category(&self) -> &EntityCategory { - &self.category + fn category(&self) -> EntityCategory { + self.category } fn entity_kind(&self) -> EntityKind { diff --git a/crates/nvisy-pattern/src/patterns/mod.rs b/crates/nvisy-pattern/src/patterns/mod.rs index 6085e4b2..7adc4e6c 100644 --- a/crates/nvisy-pattern/src/patterns/mod.rs +++ b/crates/nvisy-pattern/src/patterns/mod.rs @@ -1,7 +1,7 @@ //! Built-in detection patterns. //! //! Each pattern is a JSON file under `assets/patterns/` that describes how -//! to detect a single entity type. Files are embedded at compile time with +//! to detect a single entity type. Files are embedded at compile time with //! `include_dir!` and auto-discovered by [`PatternRegistry::load_builtins`]. //! //! # Key types @@ -10,242 +10,17 @@ //! - [`JsonPattern`]: concrete implementation deserialized from JSON. //! - [`MatchSource`]: whether matching is regex-based or dictionary-based. //! - [`ContextRule`]: optional co-occurrence keywords for confidence boosting. -//! - [`PatternRegistry`]: sorted collection with O(log n) lookup by name. +//! - [`PatternRegistry`]: sorted collection with O(log n) lookup. //! - [`JsonPatternWarning`]: non-fatal load-time diagnostics. mod context_rule; mod json_pattern; mod pattern; - -use std::collections::BTreeMap; -use std::sync::LazyLock; - -pub use context_rule::ContextRule; -use include_dir::{Dir, include_dir}; -pub use json_pattern::{JsonPattern, JsonPatternWarning}; -pub use pattern::{BoxPattern, DictionaryConfidence, MatchSource, Pattern}; - -/// A registry of named [`Pattern`] definitions with O(log n) lookup. 
-/// -/// Use [`load_builtins`] to create a registry pre-populated with -/// the compile-time-embedded pattern files. -/// -/// [`load_builtins`]: Self::load_builtins -pub struct PatternRegistry { - inner: BTreeMap, -} - -impl std::fmt::Debug for PatternRegistry { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let names: Vec<&str> = self.inner.keys().map(|s| s.as_str()).collect(); - f.debug_struct("PatternRegistry") - .field("len", &self.inner.len()) - .field("names", &names) - .finish() - } -} - -impl PatternRegistry { - /// Create an empty registry. - pub fn new() -> Self { - Self { - inner: BTreeMap::new(), - } - } - - /// Insert a pattern, keyed by its [`Pattern::name`]. - pub fn insert(&mut self, pattern: BoxPattern) { - let name = pattern.name().to_owned(); - self.inner.insert(name, pattern); - } - - /// Look up a pattern by name. - #[must_use] - pub fn get(&self, name: &str) -> Option<&dyn Pattern> { - self.inner.get(name).map(|b| b.as_ref()) - } - - /// All patterns in deterministic (alphabetical) order. - #[must_use] - pub fn values(&self) -> Vec<&dyn Pattern> { - self.inner.values().map(|b| b.as_ref()).collect() - } - - /// Total number of registered patterns. - #[must_use] - pub fn len(&self) -> usize { - self.inner.len() - } - - /// Load all `.json` files from the embedded `assets/patterns/` - /// directory and return a populated registry. - /// - /// Files that fail to parse are logged as warnings and skipped. 
- #[tracing::instrument(name = "patterns.load_builtins", fields(count))] - pub fn load_builtins() -> Self { - static PATTERN_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/assets/patterns"); - - let mut reg = Self::new(); - - for file in PATTERN_DIR.files() { - let path = file.path(); - - let Some("json") = path.extension().and_then(|e| e.to_str()) else { - tracing::warn!( - path = %path.display(), - "skipping non-JSON file in patterns directory", - ); - continue; - }; - - let (pattern, warnings) = match JsonPattern::from_bytes(file.contents()) { - Ok(pair) => pair, - Err(e) => { - tracing::warn!( - path = %path.display(), - error = %e, - "failed to load pattern, skipping", - ); - continue; - } - }; - - for w in &warnings { - match w { - JsonPatternWarning::UnknownCategory { pattern, slug } => { - tracing::warn!(%pattern, category = %slug, "unrecognised category falls through to Custom"); - } - JsonPatternWarning::UnknownValidator { pattern, validator } => { - tracing::warn!(%pattern, %validator, "unknown validator name, pattern will have no post-match validation"); - } - } - } - - tracing::trace!( - name = %pattern.name(), - category = %pattern.category(), - entity_kind = %pattern.entity_kind(), - match_source = ?pattern.match_source(), - "pattern loaded", - ); - reg.insert(Box::new(pattern)); - } - - tracing::Span::current().record("count", reg.len()); - tracing::debug!("built-in patterns loaded"); - reg - } -} - -impl Default for PatternRegistry { - fn default() -> Self { - Self::new() - } -} - -static BUILTIN_REGISTRY: LazyLock = LazyLock::new(PatternRegistry::load_builtins); - -/// Return a reference to the lazily-initialised built-in [`PatternRegistry`]. 
-pub fn builtin_registry() -> &'static PatternRegistry { - &BUILTIN_REGISTRY -} - -#[cfg(test)] -mod tests { - use super::pattern::RegexPattern; - use super::*; - - fn registry() -> &'static PatternRegistry { - builtin_registry() - } - - #[test] - fn builtins_load() { - assert!(registry().len() > 0); - } - - #[test] - fn pattern_names_are_sorted() { - let names: Vec<&str> = registry().values().iter().map(|p| p.name()).collect(); - let mut sorted = names.clone(); - sorted.sort(); - assert_eq!(names, sorted); - } - - #[test] - fn no_duplicate_pattern_names() { - let all = registry().values(); - let names: Vec<_> = all.iter().map(|p| p.name()).collect(); - let unique: std::collections::HashSet<_> = names.iter().collect(); - assert_eq!(names.len(), unique.len(), "duplicate pattern names found"); - } - - #[test] - fn all_patterns_have_valid_fields() { - for p in registry().values() { - assert!(!p.name().is_empty(), "pattern name is empty"); - match p.match_source() { - MatchSource::Regex(rp) => { - assert!(!rp.regex.is_empty(), "regex is empty for {}", p.name()); - assert!(rp.confidence > 0.0, "confidence is 0 for {}", p.name()); - assert!(rp.confidence <= 1.0, "confidence > 1 for {}", p.name()); - } - MatchSource::Dictionary(dp) => { - assert!(!dp.name.is_empty(), "dictionary is empty for {}", p.name()); - let c = dp.confidence.resolve(0); - assert!(c > 0.0, "confidence is 0 for {}", p.name()); - assert!(c <= 1.0, "confidence > 1 for {}", p.name()); - } - } - } - } - - #[test] - fn all_regex_patterns_compile() { - for p in registry().values() { - if let MatchSource::Regex(rp) = p.match_source() { - assert!( - regex::Regex::new(&rp.regex).is_ok(), - "pattern {} failed to compile: {}", - p.name(), - rp.regex, - ); - } - } - } - - #[test] - fn all_validators_resolve() { - let resolver = crate::validators::ValidatorResolver::builtins(); - for p in registry().values() { - if let MatchSource::Regex(RegexPattern { - validator: Some(name), - .. 
- }) = p.match_source() - { - assert!( - resolver.resolve(name).is_some(), - "pattern {} references unregistered validator {name}", - p.name(), - ); - } - } - } - - #[test] - fn registry_insert_and_get() { - let json = br#"{ - "name": "test", - "category": "pii", - "entity_type": "government_id", - "pattern": { "regex": "\\d+", "confidence": 0.9 } - }"#; - let (pattern, _warnings) = JsonPattern::from_bytes(json).unwrap(); - - let mut reg = PatternRegistry::new(); - reg.insert(Box::new(pattern)); - - assert_eq!(reg.len(), 1); - assert_eq!(reg.get("test").unwrap().name(), "test"); - } -} +mod pattern_error; +mod pattern_registry; + +pub use self::context_rule::ContextRule; +pub use self::json_pattern::{JsonPattern, JsonPatternWarning}; +pub use self::pattern::{BoxPattern, DictionaryConfidence, MatchSource, Pattern}; +pub(crate) use self::pattern_error::PatternLoadError; +pub use self::pattern_registry::{PatternRegistry, builtin_registry}; diff --git a/crates/nvisy-pattern/src/patterns/pattern.rs b/crates/nvisy-pattern/src/patterns/pattern.rs index 95e4b6f9..1d074d80 100644 --- a/crates/nvisy-pattern/src/patterns/pattern.rs +++ b/crates/nvisy-pattern/src/patterns/pattern.rs @@ -1,8 +1,4 @@ -//! Core [`Pattern`] trait, [`MatchSource`] enum, and [`BoxPattern`] alias. -//! -//! [`Pattern`]: crate::patterns::Pattern -//! [`MatchSource`]: crate::patterns::MatchSource -//! [`BoxPattern`]: crate::patterns::BoxPattern +//! Core [`Pattern`] trait, [`MatchSource`] enum, and [`BoxPattern`] type alias. use nvisy_ontology::entity::{EntityCategory, EntityKind}; use serde::Deserialize; @@ -14,7 +10,7 @@ use super::context_rule::ContextRule; pub struct RegexPattern { /// The regular expression string. pub regex: String, - /// Optional validator name (e.g. `"luhn"`, `"ssn"`, `"iban"`), + /// Optional validator name (e.g. `"luhn"`, `"ssn"`, `"iban"`): /// resolved at detection time via [`ValidatorResolver`]. 
/// /// [`ValidatorResolver`]: crate::validators::ValidatorResolver @@ -22,9 +18,9 @@ pub struct RegexPattern { pub validator: Option, /// Whether the regex is case-sensitive. /// - /// Defaults to `false`. When `false`, the regex is compiled with - /// inline `(?i)` or equivalent flag. - #[serde(default)] + /// Defaults to `true`. When `false`, the regex is compiled with + /// an inline `(?i)` prefix. + #[serde(default = "default_case_sensitive")] pub case_sensitive: bool, /// Confidence score (0.0–1.0) assigned to matches from this pattern. /// @@ -33,6 +29,19 @@ pub struct RegexPattern { pub confidence: f64, } +impl RegexPattern { + /// Return the regex string ready for compilation. + /// + /// Prepends `(?i)` when [`case_sensitive`](Self::case_sensitive) is `false`. + pub fn effective_regex(&self) -> String { + if self.case_sensitive { + self.regex.clone() + } else { + format!("(?i){}", self.regex) + } + } +} + /// Confidence for a dictionary pattern: either a single uniform score /// or per-column scores for CSV dictionaries. #[derive(Debug, Clone, PartialEq)] @@ -63,7 +72,7 @@ impl Default for DictionaryConfidence { } } -/// Serde helper — accepts either a single number or an array of numbers. +/// Serde helper: accepts either a single number or an array of numbers. mod confidence_serde { use serde::{Deserialize, Deserializer}; @@ -95,15 +104,15 @@ pub struct DictionaryPattern { pub name: String, /// Whether matching is case-sensitive. /// - /// Defaults to `false`. Controls the Aho-Corasick automaton's + /// Defaults to `false`. Controls the Aho-Corasick automaton's /// `ascii_case_insensitive` setting. #[serde(default)] pub case_sensitive: bool, /// Confidence score(s) for matches from this dictionary. /// - /// A single number applies uniformly to all entries. - /// An array assigns per-column confidence for CSV dictionaries - /// (e.g. `[0.85, 0.55]` gives column 0 entries 0.85 and column 1 + /// A single number applies uniformly to all entries. 
An array + /// assigns per-column confidence for CSV dictionaries (e.g. + /// `[0.85, 0.55]` gives column 0 entries 0.85 and column 1 /// entries 0.55). /// /// Defaults to `1.0` when not specified. @@ -126,6 +135,17 @@ pub enum MatchSource { Dictionary(DictionaryPattern), } +/// Default confidence score when `"confidence"` is omitted from JSON. +pub const DEFAULT_CONFIDENCE: f64 = 1.0; + +fn default_confidence() -> f64 { + DEFAULT_CONFIDENCE +} + +fn default_case_sensitive() -> bool { + true +} + /// A named detection pattern. /// /// Implementors describe a single entity type to detect, including how to @@ -137,19 +157,12 @@ pub enum MatchSource { /// from the JSON files under `assets/patterns/`. /// /// [`JsonPattern`]: super::JsonPattern -/// Default confidence score when `"confidence"` is omitted from JSON. -pub const DEFAULT_CONFIDENCE: f64 = 1.0; - -fn default_confidence() -> f64 { - DEFAULT_CONFIDENCE -} - pub trait Pattern: Send + Sync { /// Unique name identifying this pattern (e.g. `"ssn"`, `"credit-card"`). fn name(&self) -> &str; - /// High-level entity category (PII, Financial, Credentials, ...). - fn category(&self) -> &EntityCategory; + /// High-level entity category (PersonalIdentity, Financial, Credentials, ...). + fn category(&self) -> EntityCategory; /// Specific entity kind within the category (e.g. `GovernmentId`, `PaymentCard`). fn entity_kind(&self) -> EntityKind; diff --git a/crates/nvisy-pattern/src/patterns/pattern_error.rs b/crates/nvisy-pattern/src/patterns/pattern_error.rs new file mode 100644 index 00000000..8f29f1e5 --- /dev/null +++ b/crates/nvisy-pattern/src/patterns/pattern_error.rs @@ -0,0 +1,42 @@ +//! Error type for pattern filesystem loading. + +use nvisy_core::{Error, ErrorKind}; + +use super::json_pattern::JsonPatternError; + +/// Error returned when loading patterns from the filesystem. +#[derive(Debug, thiserror::Error)] +pub enum PatternLoadError { + /// The directory could not be read. 
+ #[error("failed to read pattern directory '{}': {source}", path.display())] + ReadDir { + path: std::path::PathBuf, + source: std::io::Error, + }, + /// A pattern file could not be read. + #[error("failed to read pattern file '{}': {source}", path.display())] + ReadFile { + path: std::path::PathBuf, + source: std::io::Error, + }, + /// A pattern file failed to parse. + #[error("failed to parse pattern '{}': {source}", path.display())] + Parse { + path: std::path::PathBuf, + source: JsonPatternError, + }, +} + +impl From for Error { + fn from(err: PatternLoadError) -> Self { + let kind = match &err { + PatternLoadError::ReadDir { .. } | PatternLoadError::ReadFile { .. } => { + ErrorKind::Internal + } + PatternLoadError::Parse { .. } => ErrorKind::Validation, + }; + Error::new(kind, err.to_string()) + .with_component("nvisy-pattern::patterns") + .with_source(err) + } +} diff --git a/crates/nvisy-pattern/src/patterns/pattern_registry.rs b/crates/nvisy-pattern/src/patterns/pattern_registry.rs new file mode 100644 index 00000000..6a66818b --- /dev/null +++ b/crates/nvisy-pattern/src/patterns/pattern_registry.rs @@ -0,0 +1,405 @@ +//! [`PatternRegistry`]: named pattern collection with O(log n) lookup. + +use std::collections::BTreeMap; +use std::path::Path; +use std::sync::LazyLock; + +use include_dir::{Dir, include_dir}; + +use super::{BoxPattern, JsonPattern, JsonPatternWarning, Pattern, PatternLoadError}; +use crate::validators::ValidatorResolver; + +const TARGET: &str = "nvisy_pattern::patterns"; + +/// A registry of named [`Pattern`] definitions with O(log n) lookup. +/// +/// Use [`load_builtins`] to populate with the compile-time-embedded +/// pattern files, or [`load_dir`] / [`load_file`] to load from the +/// filesystem at runtime. 
+/// +/// [`load_builtins`]: Self::load_builtins +/// [`load_dir`]: Self::load_dir +/// [`load_file`]: Self::load_file +#[derive(Default)] +pub struct PatternRegistry { + inner: BTreeMap, +} + +impl std::fmt::Debug for PatternRegistry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let names: Vec<&str> = self.inner.keys().map(|s| s.as_str()).collect(); + f.debug_struct("PatternRegistry") + .field("len", &self.inner.len()) + .field("names", &names) + .finish() + } +} + +impl PatternRegistry { + /// Create an empty registry. + pub fn new() -> Self { + Self::default() + } + + /// Insert a pattern, keyed by its [`Pattern::name`]. + pub fn insert(&mut self, pattern: BoxPattern) { + let name = pattern.name().to_owned(); + self.inner.insert(name, pattern); + } + + /// Look up a pattern by name. + #[must_use] + pub fn get(&self, name: &str) -> Option<&dyn Pattern> { + self.inner.get(name).map(|b| b.as_ref()) + } + + /// Iterate over all registered patterns as `&dyn Pattern` in + /// deterministic (alphabetical) order. + pub fn iter(&self) -> impl Iterator { + self.inner.values().map(|b| b.as_ref()) + } + + /// Iterate over all registered pattern names. + pub fn names(&self) -> impl Iterator { + self.inner.keys().map(|s| s.as_str()) + } + + /// Total number of registered patterns. + #[must_use] + pub fn len(&self) -> usize { + self.inner.len() + } + + /// Whether the registry contains no patterns. + #[must_use] + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Load all `.json` files from the embedded `assets/patterns/` + /// directory into this registry. + /// + /// Files that fail to parse are logged as warnings and skipped. 
+ #[tracing::instrument(target = TARGET, name = "patterns.load_builtins", skip(self), fields(count))] + pub fn load_builtins(&mut self) { + static PATTERN_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/assets/patterns"); + + let validators = ValidatorResolver::builtins(); + + for file in PATTERN_DIR.files() { + let path = file.path(); + + let Some("json") = path.extension().and_then(|e| e.to_str()) else { + tracing::warn!( + target: TARGET, + path = %path.display(), + "skipping non-JSON file in patterns directory", + ); + continue; + }; + + let (pattern, warnings) = match JsonPattern::from_bytes(file.contents(), &validators) { + Ok(pair) => pair, + Err(e) => { + tracing::warn!( + target: TARGET, + path = %path.display(), + error = %e, + "failed to load pattern, skipping", + ); + continue; + } + }; + + Self::log_warnings(&warnings); + + tracing::trace!( + target: TARGET, + name = %pattern.name(), + category = %pattern.category(), + entity_kind = %pattern.entity_kind(), + match_source = ?pattern.match_source(), + "pattern loaded", + ); + self.insert(Box::new(pattern)); + } + + tracing::Span::current().record("count", self.len()); + tracing::debug!(target: TARGET, "built-in patterns loaded"); + } + + /// Load a single `.json` pattern file and insert it. + /// + /// The pattern name is derived from the JSON `"name"` field, not + /// the file name. Files with non-`.json` extensions are logged as + /// warnings and ignored (no error is returned). + /// + /// # Errors + /// + /// Returns [`nvisy_core::Error`] if the file cannot be read or + /// the JSON content cannot be parsed. 
+ #[tracing::instrument(target = TARGET, name = "patterns.load_file", skip_all, fields(path = %path.as_ref().display()))] + pub fn load_file(&mut self, path: impl AsRef) -> nvisy_core::Result<()> { + let path = path.as_ref(); + + let Some("json") = path.extension().and_then(|e| e.to_str()) else { + tracing::warn!( + target: TARGET, + path = %path.display(), + "skipping non-JSON pattern file", + ); + return Ok(()); + }; + + let bytes = std::fs::read(path).map_err(|source| PatternLoadError::ReadFile { + path: path.to_owned(), + source, + })?; + + let validators = ValidatorResolver::builtins(); + let (pattern, warnings) = + JsonPattern::from_bytes(&bytes, &validators).map_err(|source| { + PatternLoadError::Parse { + path: path.to_owned(), + source, + } + })?; + + Self::log_warnings(&warnings); + + tracing::trace!( + target: TARGET, + name = %pattern.name(), + category = %pattern.category(), + entity_kind = %pattern.entity_kind(), + match_source = ?pattern.match_source(), + "pattern loaded from filesystem", + ); + self.insert(Box::new(pattern)); + Ok(()) + } + + /// Load all `.json` files from a filesystem directory. + /// + /// Non-`.json` files are logged as warnings and skipped. Loaded + /// patterns are inserted into `self`, so this can be called after + /// [`load_builtins`](Self::load_builtins) to layer user-provided + /// patterns on top of the built-ins. + /// + /// # Errors + /// + /// Returns [`nvisy_core::Error`] if the directory cannot be read, + /// a file cannot be read, or a JSON file fails to parse. 
+ #[tracing::instrument(target = TARGET, name = "patterns.load_dir", skip_all, fields(path = %dir.as_ref().display(), count))] + pub fn load_dir(&mut self, dir: impl AsRef) -> nvisy_core::Result<()> { + let dir = dir.as_ref(); + + let entries = std::fs::read_dir(dir).map_err(|source| PatternLoadError::ReadDir { + path: dir.to_owned(), + source, + })?; + + let mut count = 0usize; + for entry in entries { + let entry = entry.map_err(|source| PatternLoadError::ReadDir { + path: dir.to_owned(), + source, + })?; + let path = entry.path(); + + if !path.is_file() { + continue; + } + + self.load_file(&path)?; + count += 1; + } + + tracing::Span::current().record("count", count); + tracing::debug!(target: TARGET, "filesystem patterns loaded"); + Ok(()) + } + + fn log_warnings(warnings: &[JsonPatternWarning]) { + for w in warnings { + match w { + JsonPatternWarning::UnknownValidator { pattern, validator } => { + tracing::warn!( + target: TARGET, + %pattern, + %validator, + "unknown validator name, pattern will have no post-match validation", + ); + } + } + } + } +} + +static BUILTIN_REGISTRY: LazyLock = LazyLock::new(|| { + let mut reg = PatternRegistry::new(); + reg.load_builtins(); + reg +}); + +/// Return a reference to the lazily-initialised built-in [`PatternRegistry`]. 
+pub fn builtin_registry() -> &'static PatternRegistry { + &BUILTIN_REGISTRY +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use super::super::json_pattern::JsonPattern; + use super::super::pattern::{MatchSource, RegexPattern}; + use super::*; + use crate::validators::ValidatorResolver; + + fn registry() -> &'static PatternRegistry { + builtin_registry() + } + + #[test] + fn builtins_load() { + assert!(!registry().is_empty()); + } + + #[test] + fn pattern_names_are_sorted() { + let names: Vec<&str> = registry().names().collect(); + let mut sorted = names.clone(); + sorted.sort(); + assert_eq!(names, sorted); + } + + #[test] + fn no_duplicate_pattern_names() { + let names: Vec<_> = registry().names().collect(); + let unique: HashSet<_> = names.iter().collect(); + assert_eq!(names.len(), unique.len(), "duplicate pattern names found"); + } + + #[test] + fn all_patterns_have_valid_fields() { + for p in registry().iter() { + assert!(!p.name().is_empty(), "pattern name is empty"); + match p.match_source() { + MatchSource::Regex(rp) => { + assert!(!rp.regex.is_empty(), "regex is empty for {}", p.name()); + assert!(rp.confidence > 0.0, "confidence is 0 for {}", p.name()); + assert!(rp.confidence <= 1.0, "confidence > 1 for {}", p.name()); + } + MatchSource::Dictionary(dp) => { + assert!(!dp.name.is_empty(), "dictionary is empty for {}", p.name()); + let c = dp.confidence.resolve(0); + assert!(c > 0.0, "confidence is 0 for {}", p.name()); + assert!(c <= 1.0, "confidence > 1 for {}", p.name()); + } + } + } + } + + #[test] + fn all_regex_patterns_compile() { + for p in registry().iter() { + if let MatchSource::Regex(rp) = p.match_source() { + assert!( + regex::Regex::new(&rp.effective_regex()).is_ok(), + "pattern {} failed to compile: {}", + p.name(), + rp.regex, + ); + } + } + } + + #[test] + fn all_validators_resolve() { + let resolver = ValidatorResolver::builtins(); + for p in registry().iter() { + if let MatchSource::Regex(RegexPattern { + validator: 
Some(name), + .. + }) = p.match_source() + { + assert!( + resolver.resolve(name).is_some(), + "pattern {} references unregistered validator {name}", + p.name(), + ); + } + } + } + + #[test] + fn registry_insert_and_get() { + let validators = ValidatorResolver::builtins(); + let json = br#"{ + "name": "test", + "category": "personal_identity", + "entity_type": "government_id", + "pattern": { "regex": "\\d+", "confidence": 0.9 } + }"#; + let (pattern, _warnings) = JsonPattern::from_bytes(json, &validators).unwrap(); + + let mut reg = PatternRegistry::new(); + reg.insert(Box::new(pattern)); + + assert_eq!(reg.len(), 1); + assert_eq!(reg.get("test").unwrap().name(), "test"); + } + + #[test] + fn load_dir_reads_filesystem() { + let dir = tempfile::tempdir().unwrap(); + + std::fs::write( + dir.path().join("test_pattern.json"), + r#"{ + "name": "test_fs", + "category": "personal_identity", + "entity_type": "government_id", + "pattern": { "regex": "\\d{3}", "confidence": 0.8 } + }"#, + ) + .unwrap(); + // Should be skipped. 
+ std::fs::write(dir.path().join("readme.md"), "ignore me").unwrap(); + + let mut reg = PatternRegistry::new(); + reg.load_dir(dir.path()).unwrap(); + + assert_eq!(reg.len(), 1); + assert_eq!(reg.get("test_fs").unwrap().name(), "test_fs"); + } + + #[test] + fn load_dir_missing_directory() { + let mut reg = PatternRegistry::new(); + let result = reg.load_dir("/nonexistent/path"); + assert!(result.is_err()); + } + + #[test] + fn load_file_single_pattern() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("single.json"); + std::fs::write( + &path, + r#"{ + "name": "single_test", + "category": "contact_info", + "entity_type": "email_address", + "pattern": { "regex": ".+@.+", "confidence": 0.7 } + }"#, + ) + .unwrap(); + + let mut reg = PatternRegistry::new(); + reg.load_file(&path).unwrap(); + + assert_eq!(reg.len(), 1); + assert_eq!(reg.get("single_test").unwrap().name(), "single_test"); + } +} diff --git a/crates/nvisy-pattern/src/prelude.rs b/crates/nvisy-pattern/src/prelude.rs index 49247186..c15685ac 100644 --- a/crates/nvisy-pattern/src/prelude.rs +++ b/crates/nvisy-pattern/src/prelude.rs @@ -4,4 +4,6 @@ //! use nvisy_pattern::prelude::*; //! ``` -pub use crate::{ContextRule, DetectionSource, PatternEngine, PatternEngineBuilder, PatternMatch}; +pub use crate::{ + AllowList, DenyList, DenyRule, PatternEngine, PatternEngineBuilder, RawMatch, ScanContext, +}; diff --git a/crates/nvisy-pattern/src/validators/luhn.rs b/crates/nvisy-pattern/src/validators/luhn.rs index 8a1bd530..0d26d890 100644 --- a/crates/nvisy-pattern/src/validators/luhn.rs +++ b/crates/nvisy-pattern/src/validators/luhn.rs @@ -2,29 +2,47 @@ //! //! Implements the [Luhn algorithm](https://en.wikipedia.org/wiki/Luhn_algorithm) //! used to validate credit/debit card numbers and other identification -//! numbers. Non-digit characters (spaces, dashes) are stripped before -//! the check. +//! numbers. Only digits, spaces, and dashes are accepted as input: any +//! 
other character causes the check to fail. /// Return `true` if `num` passes the Luhn checksum. /// -/// All non-digit characters are ignored, so `"4539 1488 0343 6467"`, -/// `"4539-1488-0343-6467"`, and `"4539148803436467"` are equivalent. +/// Spaces and dashes are stripped before validation, so +/// `"4539 1488 0343 6467"`, `"4539-1488-0343-6467"`, and +/// `"4539148803436467"` are all equivalent. +/// +/// Returns `false` if the input is empty or contains characters other +/// than digits, spaces, and dashes. pub fn luhn_check(num: &str) -> bool { - let digits: String = num.chars().filter(|c| c.is_ascii_digit()).collect(); + if num.is_empty() { + return false; + } + + // Reject anything that isn't a digit, space, or dash. + if !num + .chars() + .all(|c| c.is_ascii_digit() || c == ' ' || c == '-') + { + return false; + } + + let digits: Vec = num.chars().filter_map(|c| c.to_digit(10)).collect(); + if digits.is_empty() { return false; } + let mut sum = 0u32; let mut alternate = false; - for ch in digits.chars().rev() { - let mut n = ch.to_digit(10).unwrap_or(0); + for &n in digits.iter().rev() { + let mut d = n; if alternate { - n *= 2; - if n > 9 { - n -= 9; + d *= 2; + if d > 9 { + d -= 9; } } - sum += n; + sum += d; alternate = !alternate; } sum.is_multiple_of(10) @@ -57,8 +75,20 @@ mod tests { assert!(!luhn_check("abcdef")); } + #[test] + fn mixed_alpha_digit_rejected() { + assert!(!luhn_check("45abc39")); + assert!(!luhn_check("4539 14X8 0343 6467")); + } + #[test] fn single_zero() { assert!(luhn_check("0")); } + + #[test] + fn only_separators_rejected() { + assert!(!luhn_check(" ")); + assert!(!luhn_check("---")); + } } diff --git a/crates/nvisy-pattern/src/validators/mod.rs b/crates/nvisy-pattern/src/validators/mod.rs index bb57cd13..4c1e4762 100644 --- a/crates/nvisy-pattern/src/validators/mod.rs +++ b/crates/nvisy-pattern/src/validators/mod.rs @@ -1,7 +1,7 @@ //! Post-match validators for detected entity values. //! //! 
Patterns can reference a validator by name (e.g. `"validator": "luhn"`) -//! to reduce false positives. At detection time the name is resolved to a +//! to reduce false positives. At detection time the name is resolved to a //! [`ValidatorFn`] via [`ValidatorResolver`]. mod iban; @@ -10,22 +10,19 @@ mod ssn; use std::collections::HashMap; -pub use iban::validate_iban; -pub use luhn::luhn_check; -pub use ssn::validate_ssn; +pub use self::iban::validate_iban; +pub use self::luhn::luhn_check; +pub use self::ssn::validate_ssn; -/// Signature for a validation function: takes the matched text and returns -/// `true` if the value is valid. +/// Validation function signature: takes matched text, returns `true` if +/// the value is valid. pub type ValidatorFn = fn(&str) -> bool; /// Maps validator names to [`ValidatorFn`]s. /// -/// Created with the built-in validators via [`builtins`] (or -/// [`Default`]), then optionally extended with [`register`] for -/// custom validators. -/// -/// [`builtins`]: Self::builtins -/// [`register`]: Self::register +/// Created with the built-in validators via [`builtins`](Self::builtins) +/// (or [`Default`]), then optionally extended with +/// [`register`](Self::register) for custom validators. 
#[derive(Debug, Clone)] pub struct ValidatorResolver { table: HashMap<&'static str, ValidatorFn>, diff --git a/crates/nvisy-python/Cargo.toml b/crates/nvisy-python/Cargo.toml index 6899fe59..f3db0c7e 100644 --- a/crates/nvisy-python/Cargo.toml +++ b/crates/nvisy-python/Cargo.toml @@ -2,8 +2,8 @@ [package] name = "nvisy-python" -description = "PyO3 bridge for AI NER/OCR detection via embedded Python" -keywords = ["nvisy", "python", "pyo3", "ner"] +description = "PyO3 bridge for Python-backed processing via embedded Python" +keywords = ["nvisy", "python", "pyo3", "exif"] categories = ["api-bindings"] readme = "README.md" @@ -35,6 +35,9 @@ hipstr = { workspace = true, features = [] } # Async runtime and parallelism tokio = { workspace = true, features = ["sync", "rt"] } +# Observability +tracing = { workspace = true, features = [] } + # Python interop pyo3 = { workspace = true, features = ["auto-initialize"] } pyo3-async-runtimes = { workspace = true, features = [] } diff --git a/crates/nvisy-python/src/bridge/error.rs b/crates/nvisy-python/src/bridge/error.rs index 3d0e6852..c6a6b62b 100644 --- a/crates/nvisy-python/src/bridge/error.rs +++ b/crates/nvisy-python/src/bridge/error.rs @@ -1,15 +1,16 @@ -//! Conversion utilities from Python errors to [`Error`]. +//! Conversion from Python errors to [`Error`]. use nvisy_core::Error; use pyo3::PyErr; use pyo3::types::PyTracebackMethods; -/// Convert a [`PyErr`] into an [`Error`], preserving the Python traceback when available. +/// Converts a [`PyErr`] into an [`Error`], preserving the Python +/// traceback when available. 
pub fn from_pyerr(err: PyErr) -> Error { pyo3::Python::with_gil(|py| { let traceback = err.traceback(py).map(|tb| tb.format().unwrap_or_default()); let msg = match traceback { - Some(tb) => format!("{}\n{}", err, tb), + Some(tb) => format!("{err}\n{tb}"), None => err.to_string(), }; Error::runtime(msg, "python", false) diff --git a/crates/nvisy-python/src/bridge/mod.rs b/crates/nvisy-python/src/bridge/mod.rs index 9e9a6142..1c381dba 100644 --- a/crates/nvisy-python/src/bridge/mod.rs +++ b/crates/nvisy-python/src/bridge/mod.rs @@ -1,57 +1,88 @@ //! Lightweight handle to a Python module loaded via PyO3. //! -//! Provides [`PythonBridge`] — a thin wrapper that remembers which Python -//! module to import — plus helpers for calling synchronous and asynchronous +//! Provides [`PythonBridge`]: a thin wrapper that remembers which Python +//! module to import, plus helpers for calling synchronous and asynchronous //! Python functions from Rust async code. mod error; -pub use error::from_pyerr; use hipstr::HipStr; use nvisy_core::Error; use pyo3::prelude::*; use pyo3::types::PyDict; use serde_json::Value; -/// Lightweight handle to a Python NER module. +pub use self::error::from_pyerr; + +const TARGET: &str = "nvisy_python::bridge"; + +/// Lightweight handle to a Python module. /// -/// The bridge does **not** hold the GIL or any Python objects; it simply -/// remembers which module to `import` when a detection function is called. +/// The bridge does **not** hold the GIL or any Python objects: it simply +/// remembers which module to `import` when a function is called. /// The default module name is `"nvisy_ai"`. #[derive(Clone)] pub struct PythonBridge { - /// Dotted Python module name to import (e.g., `"nvisy_ai"`). + /// Dotted Python module name to import (e.g. `"nvisy_ai"`). module_name: HipStr<'static>, } +impl std::fmt::Debug for PythonBridge { + /// Formats the bridge for debugging, showing only the module name. 
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PythonBridge") + .field("module_name", &self.module_name.as_str()) + .finish() + } +} + impl PythonBridge { - /// Create a new bridge that will load the given Python module. + /// Creates a new bridge that will load the given Python module. pub fn new(module_name: impl Into>) -> Self { Self { module_name: module_name.into(), } } - /// Initialize Python and verify the module can be imported. + /// Initializes Python and verifies the module can be imported. + /// + /// # Errors + /// + /// Returns an error if the Python interpreter cannot be started or + /// the module cannot be imported. + #[tracing::instrument(target = TARGET, name = "bridge.init", skip(self), fields(module = %self.module_name))] pub fn init(&self) -> Result<(), Error> { Python::with_gil(|py| { py.import(&*self.module_name).map_err(from_pyerr)?; + tracing::debug!(target: TARGET, "python module imported"); Ok(()) }) } - /// Get the module name. + /// Returns the dotted Python module name. + #[must_use] pub fn module_name(&self) -> &str { &self.module_name } - /// Call a **synchronous** Python method on the bridge module inside + /// Calls a **synchronous** Python method on the bridge module inside /// `spawn_blocking` + `Python::with_gil`. /// - /// `build_kwargs` receives a GIL token and must return a [`PyDict`] of - /// keyword arguments. The method is invoked as - /// `module.(**, kwargs)` and the return value is deserialized + /// `build_kwargs` receives a GIL token and must return a [`PyDict`] + /// of keyword arguments. The method is invoked as + /// `module.(**kwargs)` and the return value is deserialized /// into `Vec`. + /// + /// # Errors + /// + /// Returns an error if the Python call fails or the return value + /// cannot be deserialized. 
+ #[tracing::instrument( + target = TARGET, + name = "bridge.call_sync", + skip(self, build_kwargs), + fields(module = %self.module_name, method), + )] pub async fn call_sync(&self, method: &str, build_kwargs: F) -> Result, Error> where F: FnOnce(Python<'_>) -> Result, Error> + Send + 'static, @@ -59,6 +90,8 @@ impl PythonBridge { let module_name = self.module_name.clone(); let method = method.to_string(); + tracing::Span::current().record("method", &method); + tokio::task::spawn_blocking(move || { Python::with_gil(|py| { let module = py.import(&*module_name).map_err(from_pyerr)?; @@ -70,7 +103,7 @@ impl PythonBridge { pythonize::depythonize::>(&result).map_err(|e| { Error::runtime( - format!("Failed to deserialize {} result: {}", method, e), + format!("failed to deserialize {method} result: {e}"), "python", false, ) @@ -78,17 +111,28 @@ impl PythonBridge { }) }) .await - .map_err(|e| Error::runtime(format!("Task join error: {}", e), "python", false))? + .map_err(|e| Error::runtime(format!("blocking task panicked: {e}"), "python", false))? } - /// Call an **asynchronous** (coroutine) Python method on the bridge + /// Calls an **asynchronous** (coroutine) Python method on the bridge /// module. /// /// Acquires the GIL, invokes `module.(**kwargs)` to obtain a /// Python coroutine, converts it to a Rust [`Future`] via /// [`pyo3_async_runtimes::tokio::into_future`], and awaits it on the - /// Tokio runtime. The coroutine's return value is deserialized into + /// Tokio runtime. The coroutine's return value is deserialized into /// `Vec`. + /// + /// # Errors + /// + /// Returns an error if the Python call fails or the return value + /// cannot be deserialized. 
+ #[tracing::instrument( + target = TARGET, + name = "bridge.call_async", + skip(self, build_kwargs), + fields(module = %self.module_name, method), + )] pub async fn call_async(&self, method: &str, build_kwargs: F) -> Result, Error> where F: FnOnce(Python<'_>) -> Result, Error> + Send + 'static, @@ -96,6 +140,8 @@ impl PythonBridge { use std::future::Future; use std::pin::Pin; + tracing::Span::current().record("method", method); + let future: Pin> + Send>> = Python::with_gil(|py| -> Result<_, Error> { let module = py.import(&*self.module_name).map_err(from_pyerr)?; @@ -115,7 +161,7 @@ impl PythonBridge { Python::with_gil(|py| { pythonize::depythonize::>(py_result.bind(py)).map_err(|e| { Error::runtime( - format!("Failed to deserialize {} result: {}", method, e), + format!("failed to deserialize {method} result: {e}"), "python", false, ) @@ -125,6 +171,7 @@ impl PythonBridge { } impl Default for PythonBridge { + /// Creates a bridge with the default module name `"nvisy_ai"`. fn default() -> Self { Self::new("nvisy_ai") } diff --git a/crates/nvisy-python/src/exif/mod.rs b/crates/nvisy-python/src/exif/mod.rs new file mode 100644 index 00000000..7eac7d8b --- /dev/null +++ b/crates/nvisy-python/src/exif/mod.rs @@ -0,0 +1,14 @@ +//! EXIF metadata extraction via the Python backend. +//! +//! Provides [`ExifModule`]: a configured handle that calls +//! `nvisy_ai.extract_exif()` through the [`PythonBridge`] +//! to extract EXIF metadata from images. Returns raw JSON values: +//! metadata construction is handled by the caller. +//! +//! [`PythonBridge`]: crate::bridge::PythonBridge + +mod module; +mod params; + +pub use self::module::ExifModule; +pub use self::params::ExifParams; diff --git a/crates/nvisy-python/src/exif/module.rs b/crates/nvisy-python/src/exif/module.rs new file mode 100644 index 00000000..db833f2f --- /dev/null +++ b/crates/nvisy-python/src/exif/module.rs @@ -0,0 +1,134 @@ +//! [`ExifModule`]: EXIF extraction via the Python bridge. 
+ +use nvisy_core::Error; +use nvisy_core::content::ContentData; +use pyo3::prelude::*; +use pyo3::types::PyDict; +use serde_json::Value; + +use super::params::ExifParams; +use crate::bridge::{PythonBridge, from_pyerr}; + +const TARGET: &str = "nvisy_python::exif"; + +/// Configured handle for EXIF metadata extraction. +/// +/// Holds a [`PythonBridge`] and [`ExifParams`] so callers do not need +/// to pass them on every invocation. +#[derive(Debug, Clone)] +pub struct ExifModule { + /// Python bridge used to call into the `nvisy_ai` module. + bridge: PythonBridge, + /// Extraction parameters applied to every call. + params: ExifParams, +} + +impl ExifModule { + /// Creates a new module with the given bridge and parameters. + pub fn new(bridge: PythonBridge, params: ExifParams) -> Self { + Self { bridge, params } + } + + /// Returns a reference to the underlying bridge. + #[must_use] + pub fn bridge(&self) -> &PythonBridge { + &self.bridge + } + + /// Returns a reference to the current parameters. + #[must_use] + pub fn params(&self) -> &ExifParams { + &self.params + } + + /// Calls Python `extract_exif()` synchronously via `spawn_blocking`. + /// + /// Returns raw JSON dicts containing EXIF tag key-value pairs. + /// The MIME type is resolved from `content.content_type()`, + /// defaulting to `"application/octet-stream"` when unavailable. + /// + /// # Errors + /// + /// Returns an error if the Python call fails or the return value + /// cannot be deserialized. + #[tracing::instrument( + target = TARGET, + name = "exif.extract", + skip(self, content), + fields(data_len = content.size()), + )] + pub async fn extract(&self, content: ContentData) -> Result, Error> { + let request = ExifRequest::new(content, self.params); + + self.bridge + .call_sync("extract_exif", move |py| request.to_kwargs(py)) + .await + } + + /// Calls Python `extract_exif()` as a **coroutine** (async Python + /// function). + /// + /// Returns raw JSON dicts containing EXIF tag key-value pairs. 
+ /// The MIME type is resolved from `content.content_type()`, + /// defaulting to `"application/octet-stream"` when unavailable. + /// + /// # Errors + /// + /// Returns an error if the Python call fails or the return value + /// cannot be deserialized. + #[tracing::instrument( + target = TARGET, + name = "exif.extract_async", + skip(self, content), + fields(data_len = content.size()), + )] + pub async fn extract_async(&self, content: ContentData) -> Result, Error> { + let request = ExifRequest::new(content, self.params); + + self.bridge + .call_async("extract_exif", move |py| request.to_kwargs(py)) + .await + } +} + +/// Owned snapshot of a single EXIF extraction request. +/// +/// Wraps [`ContentData`] and [`ExifParams`] so they can be moved into +/// a `Send + 'static` closure for the bridge call. No extra allocations: +/// `ContentData` is internally arc-backed. +struct ExifRequest { + /// Content to extract EXIF metadata from. + content: ContentData, + /// Extraction parameters. + params: ExifParams, +} + +impl ExifRequest { + /// Creates a new request from content data and parameters. + fn new(content: ContentData, params: ExifParams) -> Self { + Self { content, params } + } + + /// Converts the request into a Python keyword arguments dict. 
+    fn to_kwargs<'py>(&self, py: Python<'py>) -> Result<Bound<'py, PyDict>, Error> {
+        let mime_type = self
+            .content
+            .content_type()
+            .unwrap_or("application/octet-stream");
+
+        let kwargs = PyDict::new(py);
+        kwargs
+            .set_item("image_bytes", self.content.as_bytes())
+            .map_err(from_pyerr)?;
+        kwargs
+            .set_item("mime_type", mime_type)
+            .map_err(from_pyerr)?;
+        kwargs
+            .set_item("include_gps", self.params.include_gps)
+            .map_err(from_pyerr)?;
+        kwargs
+            .set_item("include_thumbnail", self.params.include_thumbnail)
+            .map_err(from_pyerr)?;
+        Ok(kwargs)
+    }
+}
diff --git a/crates/nvisy-python/src/exif/params.rs b/crates/nvisy-python/src/exif/params.rs
new file mode 100644
index 00000000..0352b201
--- /dev/null
+++ b/crates/nvisy-python/src/exif/params.rs
@@ -0,0 +1,19 @@
+//! [`ExifParams`]: configuration for EXIF extraction calls.
+
+/// Parameters for EXIF extraction.
+#[derive(Debug, Clone, Copy)]
+pub struct ExifParams {
+    /// Whether to include GPS coordinates in the output.
+    pub include_gps: bool,
+    /// Whether to include thumbnail data in the output.
+    pub include_thumbnail: bool,
+}
+
+impl Default for ExifParams {
+    fn default() -> Self {
+        Self {
+            include_gps: true,
+            include_thumbnail: false,
+        }
+    }
+}
diff --git a/crates/nvisy-python/src/lib.rs b/crates/nvisy-python/src/lib.rs
index 96ee78ef..54c30782 100644
--- a/crates/nvisy-python/src/lib.rs
+++ b/crates/nvisy-python/src/lib.rs
@@ -3,9 +3,7 @@
 #![doc = include_str!("../README.md")]
 
 pub mod bridge;
-pub mod ner;
-pub mod ocr;
-pub mod transcribe;
+pub mod exif;
 
 #[doc(hidden)]
 pub mod prelude;
diff --git a/crates/nvisy-python/src/ner/mod.rs b/crates/nvisy-python/src/ner/mod.rs
deleted file mode 100644
index 71d568e7..00000000
--- a/crates/nvisy-python/src/ner/mod.rs
+++ /dev/null
@@ -1,139 +0,0 @@
-//! Named-entity recognition (NER) detection via a Python AI backend.
-//!
-//! Functions in this module call into the Python `nvisy_ai` module via
-//! [`PythonBridge`] and return raw JSON values.
Entity construction is -//! handled by the pipeline's `NerBackend` / `DetectNerAction` layer. - -use nvisy_core::Error; -use pyo3::prelude::*; -use pyo3::types::PyDict; -use serde_json::Value; - -use crate::bridge::{PythonBridge, from_pyerr}; - -/// Parameters for NER detection, independent of any pipeline types. -#[derive(Debug, Clone)] -pub struct NerParams { - /// Entity type labels to detect (e.g., `["PERSON", "SSN"]`). - pub entity_types: Vec, - /// Minimum confidence score to include a detection (0.0 -- 1.0). - pub confidence_threshold: f64, -} - -/// Call Python `detect_ner()` synchronously via `spawn_blocking`. -/// -/// Returns raw JSON dicts — no domain-type construction. -pub async fn detect_ner( - bridge: &PythonBridge, - text: &str, - params: &NerParams, -) -> Result, Error> { - let text = text.to_string(); - let params = params.clone(); - - bridge - .call_sync("detect_ner", move |py| { - let kwargs = PyDict::new(py); - kwargs.set_item("text", &text).map_err(from_pyerr)?; - kwargs - .set_item("entity_types", ¶ms.entity_types) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} - -/// Call Python `detect_ner_image()` synchronously via `spawn_blocking`. -/// -/// Returns raw JSON dicts — no domain-type construction. 
-pub async fn detect_ner_image( - bridge: &PythonBridge, - image_data: &[u8], - mime_type: &str, - params: &NerParams, -) -> Result, Error> { - let image_data = image_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_sync("detect_ner_image", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("image_bytes", &image_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("entity_types", ¶ms.entity_types) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} - -/// Call Python `detect_ner()` as a **coroutine** (async Python function). -/// -/// Returns raw JSON dicts — no domain-type construction. -pub async fn detect_ner_async( - bridge: &PythonBridge, - text: &str, - params: &NerParams, -) -> Result, Error> { - let text = text.to_string(); - let params = params.clone(); - - bridge - .call_async("detect_ner", move |py| { - let kwargs = PyDict::new(py); - kwargs.set_item("text", &text).map_err(from_pyerr)?; - kwargs - .set_item("entity_types", ¶ms.entity_types) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} - -/// Call Python `detect_ner_image()` as a **coroutine** (async Python function). -/// -/// Returns raw JSON dicts — no domain-type construction. 
-pub async fn detect_ner_image_async( - bridge: &PythonBridge, - image_data: &[u8], - mime_type: &str, - params: &NerParams, -) -> Result, Error> { - let image_data = image_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_async("detect_ner_image", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("image_bytes", &image_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("entity_types", ¶ms.entity_types) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} diff --git a/crates/nvisy-python/src/ocr/mod.rs b/crates/nvisy-python/src/ocr/mod.rs deleted file mode 100644 index 87c12b36..00000000 --- a/crates/nvisy-python/src/ocr/mod.rs +++ /dev/null @@ -1,96 +0,0 @@ -//! OCR text extraction via the Python backend. -//! -//! Calls `nvisy_ai.detect_ocr()` through the Python bridge to perform -//! optical character recognition on images, returning raw JSON values. -//! Entity construction is handled by the pipeline's `OcrBackend` / -//! `GenerateOcrAction` layer. - -use nvisy_core::Error; -use pyo3::prelude::*; -use pyo3::types::PyDict; -use serde_json::Value; - -use crate::bridge::{PythonBridge, from_pyerr}; - -/// Parameters for OCR detection, independent of any pipeline types. -#[derive(Debug, Clone)] -pub struct OcrParams { - /// Language hint (e.g. `"eng"` for English). - pub language: String, - /// OCR engine to use (`"tesseract"`, `"google-vision"`, `"aws-textract"`). - pub engine: String, - /// Minimum confidence threshold for OCR results. - pub confidence_threshold: f64, -} - -/// Call Python `detect_ocr()` synchronously via `spawn_blocking`. -/// -/// Returns raw JSON dicts — no domain-type construction. 
-pub async fn detect_ocr( - bridge: &PythonBridge, - image_data: &[u8], - mime_type: &str, - params: &OcrParams, -) -> Result, Error> { - let image_data = image_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_sync("detect_ocr", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("image_bytes", &image_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("language", ¶ms.language) - .map_err(from_pyerr)?; - kwargs - .set_item("engine", ¶ms.engine) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} - -/// Call Python `detect_ocr()` as a **coroutine** (async Python function). -/// -/// Returns raw JSON dicts — no domain-type construction. -pub async fn detect_ocr_async( - bridge: &PythonBridge, - image_data: &[u8], - mime_type: &str, - params: &OcrParams, -) -> Result, Error> { - let image_data = image_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_async("detect_ocr", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("image_bytes", &image_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("language", ¶ms.language) - .map_err(from_pyerr)?; - kwargs - .set_item("engine", ¶ms.engine) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} diff --git a/crates/nvisy-python/src/prelude.rs b/crates/nvisy-python/src/prelude.rs index e3b932b9..60c66244 100644 --- a/crates/nvisy-python/src/prelude.rs +++ b/crates/nvisy-python/src/prelude.rs @@ -1,4 +1,4 @@ //! Convenience re-exports. 
+ pub use crate::bridge::PythonBridge; -pub use crate::ner::NerParams; -pub use crate::ocr::OcrParams; +pub use crate::exif::{ExifModule, ExifParams}; diff --git a/crates/nvisy-python/src/transcribe/mod.rs b/crates/nvisy-python/src/transcribe/mod.rs deleted file mode 100644 index 54dc337c..00000000 --- a/crates/nvisy-python/src/transcribe/mod.rs +++ /dev/null @@ -1,100 +0,0 @@ -//! Speech-to-text transcription via the Python backend. -//! -//! Calls `nvisy_ai.transcribe()` through the Python bridge to perform -//! speech transcription on audio, returning raw JSON values. - -use nvisy_core::Error; -use pyo3::prelude::*; -use pyo3::types::PyDict; -use serde_json::Value; - -use crate::bridge::{PythonBridge, from_pyerr}; - -/// Parameters for transcription, independent of any pipeline types. -#[derive(Debug, Clone)] -pub struct TranscribeParams { - /// BCP-47 language tag for transcription. - pub language: String, - /// Whether to perform speaker diarization. - pub enable_speaker_diarization: bool, - /// Minimum confidence threshold for results. - pub confidence_threshold: f64, -} - -/// Call Python `transcribe()` synchronously via `spawn_blocking`. -/// -/// Returns raw JSON dicts — no domain-type construction. 
-pub async fn transcribe( - bridge: &PythonBridge, - audio_data: &[u8], - mime_type: &str, - params: &TranscribeParams, -) -> Result, Error> { - let audio_data = audio_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_sync("transcribe", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("audio_bytes", &audio_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("language", ¶ms.language) - .map_err(from_pyerr)?; - kwargs - .set_item( - "enable_speaker_diarization", - params.enable_speaker_diarization, - ) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} - -/// Call Python `transcribe()` as a **coroutine** (async Python function). -/// -/// Returns raw JSON dicts — no domain-type construction. -pub async fn transcribe_async( - bridge: &PythonBridge, - audio_data: &[u8], - mime_type: &str, - params: &TranscribeParams, -) -> Result, Error> { - let audio_data = audio_data.to_vec(); - let mime_type = mime_type.to_string(); - let params = params.clone(); - - bridge - .call_async("transcribe", move |py| { - let kwargs = PyDict::new(py); - kwargs - .set_item("audio_bytes", &audio_data[..]) - .map_err(from_pyerr)?; - kwargs - .set_item("mime_type", &mime_type) - .map_err(from_pyerr)?; - kwargs - .set_item("language", ¶ms.language) - .map_err(from_pyerr)?; - kwargs - .set_item( - "enable_speaker_diarization", - params.enable_speaker_diarization, - ) - .map_err(from_pyerr)?; - kwargs - .set_item("confidence_threshold", params.confidence_threshold) - .map_err(from_pyerr)?; - Ok(kwargs) - }) - .await -} diff --git a/crates/nvisy-registry/Cargo.toml b/crates/nvisy-registry/Cargo.toml index bb383c18..399ca45d 100644 --- a/crates/nvisy-registry/Cargo.toml +++ b/crates/nvisy-registry/Cargo.toml @@ -33,6 +33,9 @@ fjall = { workspace = true, 
features = [] } # Async runtime and parallelism tokio = { workspace = true, features = ["sync", "rt"] } +# Observability +tracing = { workspace = true, features = [] } + # (De)serialization serde = { workspace = true, features = [] } serde_json = { workspace = true, features = [] } diff --git a/crates/nvisy-registry/src/handler/content.rs b/crates/nvisy-registry/src/handler/content.rs new file mode 100644 index 00000000..55e29dda --- /dev/null +++ b/crates/nvisy-registry/src/handler/content.rs @@ -0,0 +1,154 @@ +//! [`ContentHandle`]: async handle to stored content data and metadata. + +use std::fmt; + +use bytes::Bytes; +use fjall::Keyspace; +use nvisy_core::content::{ContentData, ContentMetadata, ContentSource}; +use nvisy_core::{Error, ErrorKind, Result}; +use uuid::Uuid; + +use crate::registry::composite_key; + +const COMPONENT: &str = "nvisy-registry::content"; + +/// Lightweight handle to a content entry stored in the registry. +/// +/// Holds references to the fjall keyspaces so it can read content data +/// and metadata on demand. Cloning is cheap: fjall handles are +/// internally `Arc`-wrapped. +#[derive(Clone)] +pub struct ContentHandle { + /// Actor identity that owns this content entry. + actor_id: Uuid, + /// Source identifier for the stored content. + content_source: ContentSource, + /// Keyspace storing raw content bytes. + content_ks: Keyspace, + /// Keyspace storing serialized content metadata. + content_meta_ks: Keyspace, +} + +impl fmt::Debug for ContentHandle { + /// Formats the handle for debugging, omitting keyspace internals. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ContentHandle") + .field("actor_id", &self.actor_id) + .field("content_source", &self.content_source) + .finish_non_exhaustive() + } +} + +impl ContentHandle { + /// Creates a new handle from pre-resolved keyspaces. 
+    ///
+    /// This is `pub(crate)` because only [`Registry`](crate::Registry)
+    /// should construct handles after verifying the entry exists.
+    pub(crate) fn new(
+        actor_id: Uuid,
+        content_source: ContentSource,
+        content_ks: Keyspace,
+        content_meta_ks: Keyspace,
+    ) -> Self {
+        Self {
+            actor_id,
+            content_source,
+            content_ks,
+            content_meta_ks,
+        }
+    }
+
+    /// Returns the content source identifier.
+    #[must_use]
+    pub fn content_source(&self) -> ContentSource {
+        self.content_source
+    }
+
+    /// Returns the actor ID that owns this content.
+    #[must_use]
+    pub fn actor_id(&self) -> Uuid {
+        self.actor_id
+    }
+
+    /// Reads the content bytes from the store.
+    ///
+    /// The read is dispatched to a blocking thread via
+    /// [`spawn_blocking`](tokio::task::spawn_blocking) to avoid
+    /// blocking the async runtime on fjall I/O.
+    #[tracing::instrument(
+        target = COMPONENT,
+        name = "content.read_data",
+        skip(self),
+        fields(actor_id = %self.actor_id, source_id = %self.content_source.as_uuid()),
+    )]
+    pub async fn content_data(&self) -> Result<ContentData> {
+        let key = composite_key(self.actor_id, self.content_source.as_uuid());
+        let source = self.content_source;
+        let ks = self.content_ks.clone();
+
+        tokio::task::spawn_blocking(move || -> Result<ContentData> {
+            let value = ks.get(key).map_err(|err| {
+                Error::new(ErrorKind::Internal, "failed to read content data")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })?;
+
+            let guard = value.ok_or_else(|| {
+                Error::new(
+                    ErrorKind::NotFound,
+                    format!("content data not found: {}", source.as_uuid()),
+                )
+                .with_component(COMPONENT)
+            })?;
+
+            Ok(ContentData::new(source, Bytes::copy_from_slice(&guard)))
+        })
+        .await
+        .map_err(|err| {
+            Error::new(ErrorKind::Internal, "blocking task panicked")
+                .with_component(COMPONENT)
+                .with_source(err)
+        })?
+    }
+
+    /// Reads the content metadata from the store.
+    ///
+    /// Returns [`ContentMetadata::default()`] when the metadata key
+    /// exists but has no value (e.g.
+    /// content registered without metadata).
+    #[tracing::instrument(
+        target = COMPONENT,
+        name = "content.read_metadata",
+        skip(self),
+        fields(actor_id = %self.actor_id, source_id = %self.content_source.as_uuid()),
+    )]
+    pub async fn metadata(&self) -> Result<ContentMetadata> {
+        let key = composite_key(self.actor_id, self.content_source.as_uuid());
+        let ks = self.content_meta_ks.clone();
+
+        tokio::task::spawn_blocking(move || -> Result<ContentMetadata> {
+            let value = ks.get(key).map_err(|err| {
+                Error::new(ErrorKind::Internal, "failed to read content metadata")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })?;
+
+            match value {
+                Some(guard) => serde_json::from_slice(&guard).map_err(|err| {
+                    Error::new(
+                        ErrorKind::Serialization,
+                        "failed to deserialize content metadata",
+                    )
+                    .with_component(COMPONENT)
+                    .with_source(err)
+                }),
+                None => Ok(ContentMetadata::default()),
+            }
+        })
+        .await
+        .map_err(|err| {
+            Error::new(ErrorKind::Internal, "blocking task panicked")
+                .with_component(COMPONENT)
+                .with_source(err)
+        })?
+    }
+}
diff --git a/crates/nvisy-registry/src/handler/context.rs b/crates/nvisy-registry/src/handler/context.rs
new file mode 100644
index 00000000..77cc3c91
--- /dev/null
+++ b/crates/nvisy-registry/src/handler/context.rs
@@ -0,0 +1,104 @@
+//! [`ContextHandle`]: async handle to a stored detection context.
+
+use std::fmt;
+
+use fjall::Keyspace;
+use nvisy_core::content::ContentSource;
+use nvisy_core::{Error, ErrorKind, Result};
+use nvisy_ontology::context::Context;
+use uuid::Uuid;
+
+use crate::registry::composite_key;
+
+const COMPONENT: &str = "nvisy-registry::context";
+
+/// Lightweight handle to a context entry stored in the registry.
+///
+/// Holds a reference to the contexts keyspace so it can deserialize the
+/// stored JSON on demand. Cloning is cheap: fjall handles are
+/// internally `Arc`-wrapped.
+#[derive(Clone)]
+pub struct ContextHandle {
+    /// Actor identity that owns this context entry.
+ actor_id: Uuid, + /// Content source this context is associated with. + source: ContentSource, + /// Keyspace storing serialized context JSON. + contexts_ks: Keyspace, +} + +impl fmt::Debug for ContextHandle { + /// Formats the handle for debugging, omitting keyspace internals. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ContextHandle") + .field("actor_id", &self.actor_id) + .field("source", &self.source) + .finish_non_exhaustive() + } +} + +impl ContextHandle { + /// Creates a new handle from a pre-resolved keyspace. + /// + /// This is `pub(crate)` because only [`Registry`](crate::Registry) + /// should construct handles after verifying the entry exists. + pub(crate) fn new(actor_id: Uuid, source: ContentSource, contexts_ks: Keyspace) -> Self { + Self { + actor_id, + source, + contexts_ks, + } + } + + /// Returns the content source identifier. + #[must_use] + pub fn source(&self) -> ContentSource { + self.source + } + + /// Returns the actor ID that owns this context. + #[must_use] + pub fn actor_id(&self) -> Uuid { + self.actor_id + } + + /// Reads and deserializes the context from the store. + /// + /// The read is dispatched to a blocking thread via + /// [`spawn_blocking`](tokio::task::spawn_blocking) to avoid + /// blocking the async runtime on fjall I/O. 
+    #[tracing::instrument(
+        target = COMPONENT,
+        name = "context.read",
+        skip(self),
+        fields(actor_id = %self.actor_id, source_id = %self.source.as_uuid()),
+    )]
+    pub async fn context(&self) -> Result<Context> {
+        let key = composite_key(self.actor_id, self.source.as_uuid());
+        let ks = self.contexts_ks.clone();
+
+        tokio::task::spawn_blocking(move || -> Result<Context> {
+            let value = ks.get(key).map_err(|err| {
+                Error::new(ErrorKind::Internal, "failed to read context")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })?;
+
+            let guard = value.ok_or_else(|| {
+                Error::new(ErrorKind::NotFound, "context data not found").with_component(COMPONENT)
+            })?;
+
+            serde_json::from_slice(&guard).map_err(|err| {
+                Error::new(ErrorKind::Serialization, "failed to deserialize context")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })
+        })
+        .await
+        .map_err(|err| {
+            Error::new(ErrorKind::Internal, "blocking task panicked")
+                .with_component(COMPONENT)
+                .with_source(err)
+        })?
+    }
+}
diff --git a/crates/nvisy-registry/src/handler/mod.rs b/crates/nvisy-registry/src/handler/mod.rs
new file mode 100644
index 00000000..b8e68fbe
--- /dev/null
+++ b/crates/nvisy-registry/src/handler/mod.rs
@@ -0,0 +1,7 @@
+//! Async handles for reading stored content and contexts.
+
+mod content;
+mod context;
+
+pub use self::content::ContentHandle;
+pub use self::context::ContextHandle;
diff --git a/crates/nvisy-registry/src/lib.rs b/crates/nvisy-registry/src/lib.rs
index f8fa9531..a45658ca 100644
--- a/crates/nvisy-registry/src/lib.rs
+++ b/crates/nvisy-registry/src/lib.rs
@@ -1,19 +1,12 @@
-//! Actor-scoped content and context storage backed by fjall.
-//!
-//! This crate provides [`Registry`], a unified store that manages both
-//! content files and detection contexts. Every resource is scoped by a
-//! `Uuid` actor identity, so listing and reading are inherently
-//! actor-isolated at the database level via composite keys.
-//!
-//! # Core Types
-//!
-//!
- [`Registry`]: Shared, clonable handle to the fjall database -//! - [`ContentHandle`]: Lightweight async handle to stored content -//! - [`ContextHandle`]: Lightweight async handle to a stored context +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] -mod store; +mod handler; +mod registry; #[doc(hidden)] pub mod prelude; -pub use store::{ContentHandle, ContextHandle, Registry}; +pub use self::handler::{ContentHandle, ContextHandle}; +pub use self::registry::Registry; diff --git a/crates/nvisy-registry/src/registry/mod.rs b/crates/nvisy-registry/src/registry/mod.rs new file mode 100644 index 00000000..31e1b438 --- /dev/null +++ b/crates/nvisy-registry/src/registry/mod.rs @@ -0,0 +1,867 @@ +//! [`Registry`]: actor-scoped content and context store backed by fjall. + +use std::path::{Path, PathBuf}; + +use fjall::{Database, Keyspace, KeyspaceCreateOptions, KvSeparationOptions}; +use nvisy_core::content::{Content, ContentSource}; +use nvisy_core::{Error, ErrorKind, Result}; +use nvisy_ontology::context::Context; +use uuid::Uuid; + +use crate::handler::{ContentHandle, ContextHandle}; + +const TARGET: &str = "nvisy_registry"; +const COMPONENT: &str = "nvisy-registry"; + +/// Builds a 32-byte composite key: `[actor_id: 16][resource_id: 16]`. +/// +/// Used by both [`ContentHandle`] and [`ContextHandle`] to scope every +/// read/write to a specific actor. +pub(crate) fn composite_key(actor_id: Uuid, resource_id: Uuid) -> [u8; 32] { + let mut key = [0u8; 32]; + key[..16].copy_from_slice(actor_id.as_bytes()); + key[16..].copy_from_slice(resource_id.as_bytes()); + key +} + +/// Actor-scoped content and context store backed by fjall. +/// +/// Stores content data, content metadata, and contexts in three keyspaces. +/// Every key is a 32-byte composite of `[actor_id][resource_id]`, so all +/// operations are inherently scoped to a single actor. 
+/// +/// All handles are internally `Arc`-wrapped, making `Registry` cheap to +/// clone and safe to share across threads. +#[derive(Clone)] +pub struct Registry { + /// Filesystem path where the fjall database is stored. + base_dir: PathBuf, + /// Underlying fjall database handle. + db: Database, + /// Keyspace for raw content bytes (blob-separated). + content_ks: Keyspace, + /// Keyspace for serialized content metadata. + content_meta_ks: Keyspace, + /// Keyspace for serialized detection contexts. + contexts_ks: Keyspace, +} + +impl std::fmt::Debug for Registry { + /// Formats the registry for debugging, showing only the base directory. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Registry") + .field("base_dir", &self.base_dir) + .finish_non_exhaustive() + } +} + +impl Registry { + /// Opens (or creates) the fjall database at `path`. + /// + /// Three keyspaces are created: + /// - `"content"`: blob separation for efficient large-value storage + /// - `"content_meta"`: default configuration + /// - `"contexts"`: default configuration + /// + /// # Errors + /// + /// Returns an error if the database or keyspaces cannot be opened. 
+    #[tracing::instrument(target = TARGET, name = "registry.open", fields(path = %path.as_ref().display()))]
+    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
+        let base_dir = path.as_ref().to_path_buf();
+
+        let db = Database::builder(&base_dir).open().map_err(|err| {
+            Error::new(
+                ErrorKind::Internal,
+                format!("failed to open database: {}", base_dir.display()),
+            )
+            .with_component(COMPONENT)
+            .with_source(err)
+        })?;
+
+        let content_ks = db
+            .keyspace("content", || {
+                KeyspaceCreateOptions::default()
+                    .with_kv_separation(Some(KvSeparationOptions::default()))
+            })
+            .map_err(|err| {
+                Error::new(ErrorKind::Internal, "failed to open content keyspace")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })?;
+
+        let content_meta_ks = db
+            .keyspace("content_meta", KeyspaceCreateOptions::default)
+            .map_err(|err| {
+                Error::new(ErrorKind::Internal, "failed to open content_meta keyspace")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })?;
+
+        let contexts_ks = db
+            .keyspace("contexts", KeyspaceCreateOptions::default)
+            .map_err(|err| {
+                Error::new(ErrorKind::Internal, "failed to open contexts keyspace")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })?;
+
+        tracing::debug!(target: TARGET, "registry opened");
+
+        Ok(Self {
+            base_dir,
+            db,
+            content_ks,
+            content_meta_ks,
+            contexts_ks,
+        })
+    }
+
+    // -- Content operations --------------------------------------------------
+
+    /// Registers content, writing its bytes and metadata to the store.
+    ///
+    /// Returns a [`ContentHandle`] for subsequent reads.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if serialization or the underlying write fails.
+    #[tracing::instrument(
+        target = TARGET,
+        name = "registry.register_content",
+        skip(self, content),
+        fields(actor_id = %actor_id),
+    )]
+    pub async fn register_content(
+        &self,
+        actor_id: Uuid,
+        content: Content,
+    ) -> Result<ContentHandle> {
+        let content_source = content.content_source();
+        let key = composite_key(actor_id, content_source.as_uuid());
+        let data = content.as_bytes().to_vec();
+
+        let (_, content_metadata) = content.into_parts();
+        let meta_bytes =
+            serde_json::to_vec(&content_metadata.unwrap_or_default()).map_err(|err| {
+                Error::new(
+                    ErrorKind::Serialization,
+                    "failed to serialize content metadata",
+                )
+                .with_component(COMPONENT)
+                .with_source(err)
+            })?;
+
+        let content_ks = self.content_ks.clone();
+        let meta_ks = self.content_meta_ks.clone();
+        let db = self.db.clone();
+
+        tokio::task::spawn_blocking(move || -> Result<()> {
+            content_ks.insert(key, &data).map_err(|err| {
+                Error::new(ErrorKind::Internal, "failed to write content data")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })?;
+            meta_ks.insert(key, &meta_bytes).map_err(|err| {
+                Error::new(ErrorKind::Internal, "failed to write content metadata")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })?;
+            db.persist(fjall::PersistMode::SyncAll).map_err(|err| {
+                Error::new(ErrorKind::Internal, "failed to persist database")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })?;
+            Ok(())
+        })
+        .await
+        .map_err(|err| {
+            Error::new(ErrorKind::Internal, "blocking task panicked")
+                .with_component(COMPONENT)
+                .with_source(err)
+        })??;
+
+        tracing::trace!(
+            target: TARGET,
+            source_id = %content_source.as_uuid(),
+            "content registered",
+        );
+
+        Ok(ContentHandle::new(
+            actor_id,
+            content_source,
+            self.content_ks.clone(),
+            self.content_meta_ks.clone(),
+        ))
+    }
+
+    /// Looks up previously registered content by actor and content ID.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key.
+    #[tracing::instrument(
+        target = TARGET,
+        name = "registry.read_content",
+        skip(self),
+        fields(actor_id = %actor_id, content_id = %content_id),
+    )]
+    pub async fn read_content(&self, actor_id: Uuid, content_id: Uuid) -> Result<ContentHandle> {
+        let key = composite_key(actor_id, content_id);
+        let ks = self.content_ks.clone();
+
+        let exists = tokio::task::spawn_blocking(move || -> Result<bool> {
+            ks.contains_key(key).map_err(|err| {
+                Error::new(ErrorKind::Internal, "failed to check content key")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })
+        })
+        .await
+        .map_err(|err| {
+            Error::new(ErrorKind::Internal, "blocking task panicked")
+                .with_component(COMPONENT)
+                .with_source(err)
+        })??;
+
+        if !exists {
+            return Err(Error::new(
+                ErrorKind::NotFound,
+                format!("content not found: actor_id={actor_id}, content_id={content_id}"),
+            )
+            .with_component(COMPONENT));
+        }
+
+        let source = ContentSource::from(content_id);
+        Ok(ContentHandle::new(
+            actor_id,
+            source,
+            self.content_ks.clone(),
+            self.content_meta_ks.clone(),
+        ))
+    }
+
+    /// Removes a single content entry (data + metadata) by actor and content ID.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key.
+ #[tracing::instrument( + target = TARGET, + name = "registry.unregister_content", + skip(self), + fields(actor_id = %actor_id, content_id = %content_id), + )] + pub async fn unregister_content(&self, actor_id: Uuid, content_id: Uuid) -> Result<()> { + let key = composite_key(actor_id, content_id); + let content_ks = self.content_ks.clone(); + let meta_ks = self.content_meta_ks.clone(); + let db = self.db.clone(); + + tokio::task::spawn_blocking(move || -> Result<()> { + let exists = content_ks.contains_key(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to check content key") + .with_component(COMPONENT) + .with_source(err) + })?; + + if !exists { + return Err(Error::new( + ErrorKind::NotFound, + format!("content not found: actor_id={actor_id}, content_id={content_id}"), + ) + .with_component(COMPONENT)); + } + + content_ks.remove(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to remove content data") + .with_component(COMPONENT) + .with_source(err) + })?; + meta_ks.remove(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to remove content metadata") + .with_component(COMPONENT) + .with_source(err) + })?; + db.persist(fjall::PersistMode::SyncAll).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to persist database") + .with_component(COMPONENT) + .with_source(err) + })?; + Ok(()) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })? + } + + /// Removes all content entries (data + metadata) for an actor. + /// + /// Returns the number of entries removed. 
+    #[tracing::instrument(
+        target = TARGET,
+        name = "registry.unregister_all_content",
+        skip(self),
+        fields(actor_id = %actor_id, removed),
+    )]
+    pub async fn unregister_all_content(&self, actor_id: Uuid) -> Result<usize> {
+        let prefix = actor_id.as_bytes().to_vec();
+        let content_ks = self.content_ks.clone();
+        let meta_ks = self.content_meta_ks.clone();
+        let db = self.db.clone();
+
+        let count = tokio::task::spawn_blocking(move || -> Result<usize> {
+            let keys = collect_prefix_keys(&content_ks, &prefix)?;
+            let count = keys.len();
+
+            for key in &keys {
+                content_ks.remove(key).map_err(|err| {
+                    Error::new(ErrorKind::Internal, "failed to remove content data")
+                        .with_component(COMPONENT)
+                        .with_source(err)
+                })?;
+                meta_ks.remove(key).map_err(|err| {
+                    Error::new(ErrorKind::Internal, "failed to remove content metadata")
+                        .with_component(COMPONENT)
+                        .with_source(err)
+                })?;
+            }
+
+            if count > 0 {
+                db.persist(fjall::PersistMode::SyncAll).map_err(|err| {
+                    Error::new(ErrorKind::Internal, "failed to persist database")
+                        .with_component(COMPONENT)
+                        .with_source(err)
+                })?;
+            }
+
+            Ok(count)
+        })
+        .await
+        .map_err(|err| {
+            Error::new(ErrorKind::Internal, "blocking task panicked")
+                .with_component(COMPONENT)
+                .with_source(err)
+        })??;
+
+        tracing::Span::current().record("removed", count);
+        Ok(count)
+    }
+
+    /// Lists all content IDs for an actor, sorted in ascending order.
+    #[tracing::instrument(
+        target = TARGET,
+        name = "registry.list_content",
+        skip(self),
+        fields(actor_id = %actor_id),
+    )]
+    pub async fn list_content(&self, actor_id: Uuid) -> Result<Vec<Uuid>> {
+        let prefix = actor_id.as_bytes().to_vec();
+        let ks = self.content_ks.clone();
+
+        tokio::task::spawn_blocking(move || extract_resource_ids(&ks, &prefix))
+            .await
+            .map_err(|err| {
+                Error::new(ErrorKind::Internal, "blocking task panicked")
+                    .with_component(COMPONENT)
+                    .with_source(err)
+            })?
+ } + + // -- Context operations -------------------------------------------------- + + /// Registers a context, serializing it as JSON. + /// + /// Returns a [`ContextHandle`] for subsequent reads. + /// + /// # Errors + /// + /// Returns an error if serialization or the underlying write fails. + #[tracing::instrument( + target = TARGET, + name = "registry.register_context", + skip(self, context), + fields(actor_id = %actor_id), + )] + pub async fn register_context( + &self, + actor_id: Uuid, + context: Context, + ) -> Result { + let source = context.source; + let key = composite_key(actor_id, source.as_uuid()); + + let json_bytes = serde_json::to_vec(&context).map_err(|err| { + Error::new(ErrorKind::Serialization, "failed to serialize context") + .with_component(COMPONENT) + .with_source(err) + })?; + + let ks = self.contexts_ks.clone(); + let db = self.db.clone(); + + tokio::task::spawn_blocking(move || -> Result<()> { + ks.insert(key, &json_bytes).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to write context") + .with_component(COMPONENT) + .with_source(err) + })?; + db.persist(fjall::PersistMode::SyncAll).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to persist database") + .with_component(COMPONENT) + .with_source(err) + })?; + Ok(()) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })??; + + tracing::trace!( + target: TARGET, + source_id = %source.as_uuid(), + "context registered", + ); + + Ok(ContextHandle::new( + actor_id, + source, + self.contexts_ks.clone(), + )) + } + + /// Looks up a previously registered context by actor and context ID. + /// + /// # Errors + /// + /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. 
+ #[tracing::instrument( + target = TARGET, + name = "registry.read_context", + skip(self), + fields(actor_id = %actor_id, context_id = %context_id), + )] + pub async fn read_context(&self, actor_id: Uuid, context_id: Uuid) -> Result { + let key = composite_key(actor_id, context_id); + let ks = self.contexts_ks.clone(); + + let exists = tokio::task::spawn_blocking(move || -> Result { + ks.contains_key(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to check context key") + .with_component(COMPONENT) + .with_source(err) + }) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })??; + + if !exists { + return Err(Error::new( + ErrorKind::NotFound, + format!("context not found: actor_id={actor_id}, context_id={context_id}"), + ) + .with_component(COMPONENT)); + } + + let source = ContentSource::from(context_id); + Ok(ContextHandle::new( + actor_id, + source, + self.contexts_ks.clone(), + )) + } + + /// Removes a single context entry by actor and context ID. + /// + /// # Errors + /// + /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. 
+ #[tracing::instrument( + target = TARGET, + name = "registry.unregister_context", + skip(self), + fields(actor_id = %actor_id, context_id = %context_id), + )] + pub async fn unregister_context(&self, actor_id: Uuid, context_id: Uuid) -> Result<()> { + let key = composite_key(actor_id, context_id); + let ks = self.contexts_ks.clone(); + let db = self.db.clone(); + + tokio::task::spawn_blocking(move || -> Result<()> { + let exists = ks.contains_key(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to check context key") + .with_component(COMPONENT) + .with_source(err) + })?; + + if !exists { + return Err(Error::new( + ErrorKind::NotFound, + format!("context not found: actor_id={actor_id}, context_id={context_id}"), + ) + .with_component(COMPONENT)); + } + + ks.remove(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to remove context") + .with_component(COMPONENT) + .with_source(err) + })?; + db.persist(fjall::PersistMode::SyncAll).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to persist database") + .with_component(COMPONENT) + .with_source(err) + })?; + Ok(()) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })? + } + + /// Removes all context entries for an actor. + /// + /// Returns the number of entries removed. 
+ #[tracing::instrument( + target = TARGET, + name = "registry.unregister_all_contexts", + skip(self), + fields(actor_id = %actor_id, removed), + )] + pub async fn unregister_all_contexts(&self, actor_id: Uuid) -> Result { + let prefix = actor_id.as_bytes().to_vec(); + let ks = self.contexts_ks.clone(); + let db = self.db.clone(); + + let count = tokio::task::spawn_blocking(move || -> Result { + let keys = collect_prefix_keys(&ks, &prefix)?; + let count = keys.len(); + + for key in &keys { + ks.remove(key).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to remove context") + .with_component(COMPONENT) + .with_source(err) + })?; + } + + if count > 0 { + db.persist(fjall::PersistMode::SyncAll).map_err(|err| { + Error::new(ErrorKind::Internal, "failed to persist database") + .with_component(COMPONENT) + .with_source(err) + })?; + } + + Ok(count) + }) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })??; + + tracing::Span::current().record("removed", count); + Ok(count) + } + + /// Lists all context IDs for an actor, sorted in ascending order. + #[tracing::instrument( + target = TARGET, + name = "registry.list_contexts", + skip(self), + fields(actor_id = %actor_id), + )] + pub async fn list_contexts(&self, actor_id: Uuid) -> Result> { + let prefix = actor_id.as_bytes().to_vec(); + let ks = self.contexts_ks.clone(); + + tokio::task::spawn_blocking(move || extract_resource_ids(&ks, &prefix)) + .await + .map_err(|err| { + Error::new(ErrorKind::Internal, "blocking task panicked") + .with_component(COMPONENT) + .with_source(err) + })? + } + + /// Returns the base directory path where the database is stored. + #[must_use] + pub fn base_dir(&self) -> &Path { + &self.base_dir + } +} + +/// Collects all raw keys from a keyspace that share the given prefix. 
+fn collect_prefix_keys(ks: &Keyspace, prefix: &[u8]) -> Result>> { + ks.prefix(prefix) + .map(|guard| { + let key = guard.key().map_err(|err| { + Error::new(ErrorKind::Internal, "failed to iterate keyspace") + .with_component(COMPONENT) + .with_source(err) + })?; + Ok(key.to_vec()) + }) + .collect() +} + +/// Extracts sorted resource UUIDs from the trailing 16 bytes of each +/// 32-byte composite key that shares the given prefix. +fn extract_resource_ids(ks: &Keyspace, prefix: &[u8]) -> Result> { + let mut ids = Vec::new(); + for guard in ks.prefix(prefix) { + let key = guard.key().map_err(|err| { + Error::new(ErrorKind::Internal, "failed to iterate keyspace") + .with_component(COMPONENT) + .with_source(err) + })?; + if key.len() == 32 + && let Ok(bytes) = <[u8; 16]>::try_from(&key[16..]) + { + ids.push(Uuid::from_bytes(bytes)); + } + } + ids.sort(); + Ok(ids) +} + +#[cfg(test)] +mod tests { + use nvisy_core::content::{Content, ContentData}; + use nvisy_ontology::context::Context; + + use super::*; + + /// Opens a temporary registry backed by a fresh [`tempfile::TempDir`]. 
+ fn open_temp_registry() -> (tempfile::TempDir, Registry) { + let temp = tempfile::TempDir::new().unwrap(); + let registry = Registry::open(temp.path().join("data")).unwrap(); + (temp, registry) + } + + #[tokio::test] + async fn register_and_read_content() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + let content = Content::new(ContentData::from("Hello, world!")); + + let handle = registry.register_content(actor_id, content).await.unwrap(); + let data = handle.content_data().await.unwrap(); + assert_eq!(data.as_str().unwrap(), "Hello, world!"); + } + + #[tokio::test] + async fn content_scoped_by_actor() { + let (_temp, registry) = open_temp_registry(); + let actor_a = Uuid::now_v7(); + let actor_b = Uuid::now_v7(); + + let content = Content::new(ContentData::from("actor A only")); + let handle = registry.register_content(actor_a, content).await.unwrap(); + let content_id = handle.content_source().as_uuid(); + + // Actor B cannot see actor A's content. + let err = registry + .read_content(actor_b, content_id) + .await + .unwrap_err(); + assert_eq!(err.kind, ErrorKind::NotFound); + + // Actor A can. 
+ registry.read_content(actor_a, content_id).await.unwrap(); + } + + #[tokio::test] + async fn list_content_per_actor() { + let (_temp, registry) = open_temp_registry(); + let actor_a = Uuid::now_v7(); + let actor_b = Uuid::now_v7(); + + registry + .register_content(actor_a, Content::new(ContentData::from("a1"))) + .await + .unwrap(); + registry + .register_content(actor_a, Content::new(ContentData::from("a2"))) + .await + .unwrap(); + registry + .register_content(actor_b, Content::new(ContentData::from("b1"))) + .await + .unwrap(); + + assert_eq!(registry.list_content(actor_a).await.unwrap().len(), 2); + assert_eq!(registry.list_content(actor_b).await.unwrap().len(), 1); + } + + #[tokio::test] + async fn unregister_content() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + let content = Content::new(ContentData::from("delete me")); + let content_id = content.content_source().as_uuid(); + registry.register_content(actor_id, content).await.unwrap(); + + registry + .unregister_content(actor_id, content_id) + .await + .unwrap(); + + let err = registry + .read_content(actor_id, content_id) + .await + .unwrap_err(); + assert_eq!(err.kind, ErrorKind::NotFound); + } + + #[tokio::test] + async fn unregister_all_content() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + + registry + .register_content(actor_id, Content::new(ContentData::from("first"))) + .await + .unwrap(); + registry + .register_content(actor_id, Content::new(ContentData::from("second"))) + .await + .unwrap(); + + let deleted = registry.unregister_all_content(actor_id).await.unwrap(); + assert_eq!(deleted, 2); + assert!(registry.list_content(actor_id).await.unwrap().is_empty()); + } + + #[tokio::test] + async fn register_and_read_context() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + let ctx = Context::new("test-context", vec![]); + + let handle = registry + .register_context(actor_id, 
ctx.clone()) + .await + .unwrap(); + let read_ctx = handle.context().await.unwrap(); + assert_eq!(read_ctx.name, "test-context"); + } + + #[tokio::test] + async fn context_scoped_by_actor() { + let (_temp, registry) = open_temp_registry(); + let actor_a = Uuid::now_v7(); + let actor_b = Uuid::now_v7(); + + let ctx = Context::new("private", vec![]); + let handle = registry.register_context(actor_a, ctx).await.unwrap(); + let context_id = handle.source().as_uuid(); + + let err = registry + .read_context(actor_b, context_id) + .await + .unwrap_err(); + assert_eq!(err.kind, ErrorKind::NotFound); + + registry.read_context(actor_a, context_id).await.unwrap(); + } + + #[tokio::test] + async fn list_contexts_per_actor() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + + registry + .register_context(actor_id, Context::new("ctx-1", vec![])) + .await + .unwrap(); + registry + .register_context(actor_id, Context::new("ctx-2", vec![])) + .await + .unwrap(); + + assert_eq!(registry.list_contexts(actor_id).await.unwrap().len(), 2); + } + + #[tokio::test] + async fn unregister_context() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + let ctx = Context::new("remove-me", vec![]); + let context_id = ctx.source.as_uuid(); + + registry.register_context(actor_id, ctx).await.unwrap(); + registry + .unregister_context(actor_id, context_id) + .await + .unwrap(); + + let err = registry + .read_context(actor_id, context_id) + .await + .unwrap_err(); + assert_eq!(err.kind, ErrorKind::NotFound); + } + + #[tokio::test] + async fn unregister_all_contexts() { + let (_temp, registry) = open_temp_registry(); + let actor_id = Uuid::now_v7(); + + registry + .register_context(actor_id, Context::new("c1", vec![])) + .await + .unwrap(); + registry + .register_context(actor_id, Context::new("c2", vec![])) + .await + .unwrap(); + + let deleted = registry.unregister_all_contexts(actor_id).await.unwrap(); + assert_eq!(deleted, 2); + 
assert!(registry.list_contexts(actor_id).await.unwrap().is_empty()); + } + + #[tokio::test] + async fn data_persists_across_reopen() { + let temp = tempfile::TempDir::new().unwrap(); + let path = temp.path().join("data"); + let actor_id = Uuid::now_v7(); + + let content = Content::new(ContentData::from("persistent")); + let content_id = content.content_source().as_uuid(); + + { + let registry = Registry::open(&path).unwrap(); + registry.register_content(actor_id, content).await.unwrap(); + } + + let registry = Registry::open(&path).unwrap(); + let handle = registry.read_content(actor_id, content_id).await.unwrap(); + let data = handle.content_data().await.unwrap(); + assert_eq!(data.as_str().unwrap(), "persistent"); + } + + #[tokio::test] + async fn base_dir() { + let temp = tempfile::TempDir::new().unwrap(); + let base = temp.path().join("data"); + let registry = Registry::open(&base).unwrap(); + assert_eq!(registry.base_dir(), base); + } +} diff --git a/crates/nvisy-registry/src/store/content.rs b/crates/nvisy-registry/src/store/content.rs deleted file mode 100644 index 9d501496..00000000 --- a/crates/nvisy-registry/src/store/content.rs +++ /dev/null @@ -1,111 +0,0 @@ -use std::fmt; - -use bytes::Bytes; -use fjall::Keyspace; -use nvisy_core::content::{ContentData, ContentMetadata, ContentSource}; -use nvisy_core::{Error, ErrorKind, Result}; -use uuid::Uuid; - -/// Lightweight handle to a content entry stored in the registry. -/// -/// Holds references to the fjall keyspaces so it can read content data -/// and metadata on demand. Cloning is cheap because fjall handles are -/// internally `Arc`-wrapped. 
-#[derive(Clone)] -pub struct ContentHandle { - actor: Uuid, - content_source: ContentSource, - content: Keyspace, - content_meta: Keyspace, -} - -impl fmt::Debug for ContentHandle { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("ContentHandle") - .field("actor", &self.actor) - .field("content_source", &self.content_source) - .finish_non_exhaustive() - } -} - -impl ContentHandle { - pub(crate) fn new( - actor: Uuid, - content_source: ContentSource, - content: Keyspace, - content_meta: Keyspace, - ) -> Self { - Self { - actor, - content_source, - content, - content_meta, - } - } - - /// Returns the content source identifier. - pub fn content_source(&self) -> ContentSource { - self.content_source - } - - /// Returns the actor that owns this content. - pub fn actor(&self) -> Uuid { - self.actor - } - - /// Reads the content bytes from the store. - pub async fn content_data(&self) -> Result { - let key = self.composite_key(); - let source = self.content_source; - let content_ks = self.content.clone(); - - tokio::task::spawn_blocking(move || -> Result { - let value = content_ks.get(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to read content data").with_source(err) - })?; - - let guard = value.ok_or_else(|| { - Error::new( - ErrorKind::NotFound, - format!("Content data not found (id: {})", source.as_uuid()), - ) - })?; - - Ok(ContentData::new(source, Bytes::copy_from_slice(&guard))) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - /// Reads the content metadata from the store. 
- pub async fn metadata(&self) -> Result { - let key = self.composite_key(); - let meta_ks = self.content_meta.clone(); - - tokio::task::spawn_blocking(move || -> Result { - let value = meta_ks.get(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to read content metadata").with_source(err) - })?; - - match value { - Some(guard) => serde_json::from_slice(&guard).map_err(|err| { - Error::new( - ErrorKind::Serialization, - "Failed to deserialize content metadata", - ) - .with_source(err) - }), - None => Ok(ContentMetadata::default()), - } - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - fn composite_key(&self) -> [u8; 32] { - let mut key = [0u8; 32]; - key[..16].copy_from_slice(self.actor.as_bytes()); - key[16..].copy_from_slice(self.content_source.as_uuid().as_bytes()); - key - } -} diff --git a/crates/nvisy-registry/src/store/context.rs b/crates/nvisy-registry/src/store/context.rs deleted file mode 100644 index 6e690ffd..00000000 --- a/crates/nvisy-registry/src/store/context.rs +++ /dev/null @@ -1,77 +0,0 @@ -use std::fmt; - -use fjall::Keyspace; -use nvisy_core::content::ContentSource; -use nvisy_core::{Error, ErrorKind, Result}; -use nvisy_ontology::context::Context; -use uuid::Uuid; - -/// Lightweight handle to a context entry stored in the registry. -/// -/// Holds a reference to the contexts keyspace so it can deserialize the -/// stored JSON on demand. Cloning is cheap because fjall handles are -/// internally `Arc`-wrapped. 
-#[derive(Clone)] -pub struct ContextHandle { - actor: Uuid, - source: ContentSource, - contexts: Keyspace, -} - -impl fmt::Debug for ContextHandle { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("ContextHandle") - .field("actor", &self.actor) - .field("source", &self.source) - .finish_non_exhaustive() - } -} - -impl ContextHandle { - pub(crate) fn new(actor: Uuid, source: ContentSource, contexts: Keyspace) -> Self { - Self { - actor, - source, - contexts, - } - } - - /// Returns the content source identifier. - pub fn source(&self) -> ContentSource { - self.source - } - - /// Returns the actor that owns this context. - pub fn actor(&self) -> Uuid { - self.actor - } - - /// Reads and deserializes the context from the store. - pub async fn context(&self) -> Result { - let key = self.composite_key(); - let ctx_ks = self.contexts.clone(); - - tokio::task::spawn_blocking(move || -> Result { - let value = ctx_ks.get(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to read context").with_source(err) - })?; - - let guard = - value.ok_or_else(|| Error::new(ErrorKind::NotFound, "Context data not found"))?; - - serde_json::from_slice(&guard).map_err(|err| { - Error::new(ErrorKind::Serialization, "Failed to deserialize context") - .with_source(err) - }) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? 
- } - - fn composite_key(&self) -> [u8; 32] { - let mut key = [0u8; 32]; - key[..16].copy_from_slice(self.actor.as_bytes()); - key[16..].copy_from_slice(self.source.as_uuid().as_bytes()); - key - } -} diff --git a/crates/nvisy-registry/src/store/mod.rs b/crates/nvisy-registry/src/store/mod.rs deleted file mode 100644 index 83c342ef..00000000 --- a/crates/nvisy-registry/src/store/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -mod content; -mod context; -mod registry; - -pub use content::ContentHandle; -pub use context::ContextHandle; -pub use registry::Registry; diff --git a/crates/nvisy-registry/src/store/registry.rs b/crates/nvisy-registry/src/store/registry.rs deleted file mode 100644 index 53a710bb..00000000 --- a/crates/nvisy-registry/src/store/registry.rs +++ /dev/null @@ -1,653 +0,0 @@ -use std::path::{Path, PathBuf}; - -use fjall::{Database, Keyspace, KeyspaceCreateOptions, KvSeparationOptions}; -use nvisy_core::content::{Content, ContentSource}; -use nvisy_core::{Error, ErrorKind, Result}; -use nvisy_ontology::context::Context; -use uuid::Uuid; - -use super::content::ContentHandle; -use super::context::ContextHandle; - -/// Builds a 32-byte composite key: `[actor: 16][resource_id: 16]`. -fn make_key(actor: Uuid, id: Uuid) -> [u8; 32] { - let mut key = [0u8; 32]; - key[..16].copy_from_slice(actor.as_bytes()); - key[16..].copy_from_slice(id.as_bytes()); - key -} - -/// Actor-scoped content and context store backed by fjall. -/// -/// Stores content data, content metadata, and contexts in three keyspaces. -/// Every key is a 32-byte composite of `[actor_id][resource_id]`, so all -/// operations are inherently scoped to a single actor. -/// -/// All handles are internally `Arc`-wrapped, making `Registry` cheap to -/// clone and safe to share across threads. 
-#[derive(Clone)] -pub struct Registry { - base_dir: PathBuf, - db: Database, - content: Keyspace, - content_meta: Keyspace, - contexts: Keyspace, -} - -impl std::fmt::Debug for Registry { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("Registry") - .field("base_dir", &self.base_dir) - .finish_non_exhaustive() - } -} - -impl Registry { - /// Opens (or creates) the fjall database at `path`. - /// - /// Three keyspaces are created: - /// - `"content"` with blob separation for efficient large-value storage - /// - `"content_meta"` with default configuration - /// - `"contexts"` with default configuration - /// - /// # Errors - /// - /// Returns an error if the database or keyspaces cannot be opened. - pub fn open(path: impl Into) -> Result { - let base_dir = path.into(); - - let db = Database::builder(&base_dir).open().map_err(|err| { - Error::new( - ErrorKind::Internal, - format!( - "Failed to open registry database (path: {})", - base_dir.display() - ), - ) - .with_source(err) - })?; - - let content = db - .keyspace("content", || { - KeyspaceCreateOptions::default() - .with_kv_separation(Some(KvSeparationOptions::default())) - }) - .map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to open content keyspace").with_source(err) - })?; - - let content_meta = db - .keyspace("content_meta", KeyspaceCreateOptions::default) - .map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to open content_meta keyspace") - .with_source(err) - })?; - - let contexts = db - .keyspace("contexts", KeyspaceCreateOptions::default) - .map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to open contexts keyspace").with_source(err) - })?; - - Ok(Self { - base_dir, - db, - content, - content_meta, - contexts, - }) - } - - /// Registers content, writing its bytes and metadata to the store. - /// - /// Returns a [`ContentHandle`] for subsequent reads. 
- pub async fn register_content(&self, actor: Uuid, content: Content) -> Result { - let content_source = content.content_source(); - let key = make_key(actor, content_source.as_uuid()); - let data = content.as_bytes().to_vec(); - - let (_, content_metadata) = content.into_parts(); - let meta_bytes = - serde_json::to_vec(&content_metadata.unwrap_or_default()).map_err(|err| { - Error::new( - ErrorKind::Serialization, - "Failed to serialize content metadata", - ) - .with_source(err) - })?; - - let content_ks = self.content.clone(); - let meta_ks = self.content_meta.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result<()> { - content_ks.insert(key, &data).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to write content data").with_source(err) - })?; - meta_ks.insert(key, &meta_bytes).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to write content metadata").with_source(err) - })?; - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - Ok(()) - }) - .await - .map_err(|err| { - Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err) - })??; - - Ok(ContentHandle::new( - actor, - content_source, - self.content.clone(), - self.content_meta.clone(), - )) - } - - /// Looks up previously registered content by actor and content ID. - /// - /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. 
- pub async fn read_content(&self, actor: Uuid, id: Uuid) -> Result { - let key = make_key(actor, id); - let content_ks = self.content.clone(); - - let exists = tokio::task::spawn_blocking(move || -> Result { - content_ks.contains_key(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to check content key").with_source(err) - }) - }) - .await - .map_err(|err| { - Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err) - })??; - - if !exists { - return Err(Error::new( - ErrorKind::NotFound, - format!("Content not found (actor: {actor}, id: {id})"), - )); - } - - let source = ContentSource::from(id); - Ok(ContentHandle::new( - actor, - source, - self.content.clone(), - self.content_meta.clone(), - )) - } - - /// Removes a single content entry by actor and content ID. - /// - /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. - pub async fn unregister_content(&self, actor: Uuid, id: Uuid) -> Result<()> { - let key = make_key(actor, id); - let content_ks = self.content.clone(); - let meta_ks = self.content_meta.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result<()> { - let exists = content_ks.contains_key(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to check content key").with_source(err) - })?; - - if !exists { - return Err(Error::new( - ErrorKind::NotFound, - format!("Content not found (actor: {actor}, id: {id})"), - )); - } - - content_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove content data").with_source(err) - })?; - meta_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove content metadata") - .with_source(err) - })?; - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - Ok(()) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? 
- } - - /// Removes all content entries for an actor. - /// - /// Returns the number of entries removed. - pub async fn unregister_all_content(&self, actor: Uuid) -> Result { - let prefix = actor.as_bytes().to_vec(); - let content_ks = self.content.clone(); - let meta_ks = self.content_meta.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result { - let keys: Vec> = content_ks - .prefix(&prefix) - .map(|guard| { - let key = guard.key().map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to iterate content keyspace") - .with_source(err) - })?; - Ok(key.to_vec()) - }) - .collect::>>()?; - - let count = keys.len(); - - for key in &keys { - content_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove content data") - .with_source(err) - })?; - meta_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove content metadata") - .with_source(err) - })?; - } - - if count > 0 { - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - } - - Ok(count) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - /// Lists all content IDs for an actor. - pub async fn list_content(&self, actor: Uuid) -> Result> { - let prefix = actor.as_bytes().to_vec(); - let content_ks = self.content.clone(); - - tokio::task::spawn_blocking(move || -> Result> { - let mut ids = Vec::new(); - for guard in content_ks.prefix(&prefix) { - let key = guard.key().map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to iterate content keyspace") - .with_source(err) - })?; - if key.len() == 32 - && let Ok(bytes) = <[u8; 16]>::try_from(&key[16..]) - { - ids.push(Uuid::from_bytes(bytes)); - } - } - ids.sort(); - Ok(ids) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? 
- } - - /// Registers a context, serializing it as JSON. - /// - /// Returns a [`ContextHandle`] for subsequent reads. - pub async fn register_context(&self, actor: Uuid, context: Context) -> Result { - let source = context.source; - let key = make_key(actor, source.as_uuid()); - - let json_bytes = serde_json::to_vec(&context).map_err(|err| { - Error::new(ErrorKind::Serialization, "Failed to serialize context").with_source(err) - })?; - - let ctx_ks = self.contexts.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result<()> { - ctx_ks.insert(key, &json_bytes).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to write context").with_source(err) - })?; - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - Ok(()) - }) - .await - .map_err(|err| { - Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err) - })??; - - Ok(ContextHandle::new(actor, source, self.contexts.clone())) - } - - /// Looks up a previously registered context by actor and context ID. - /// - /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. - pub async fn read_context(&self, actor: Uuid, id: Uuid) -> Result { - let key = make_key(actor, id); - let ctx_ks = self.contexts.clone(); - - let exists = tokio::task::spawn_blocking(move || -> Result { - ctx_ks.contains_key(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to check context key").with_source(err) - }) - }) - .await - .map_err(|err| { - Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err) - })??; - - if !exists { - return Err(Error::new( - ErrorKind::NotFound, - format!("Context not found (actor: {actor}, id: {id})"), - )); - } - - let source = ContentSource::from(id); - Ok(ContextHandle::new(actor, source, self.contexts.clone())) - } - - /// Removes a single context entry by actor and context ID. 
- /// - /// Returns [`ErrorKind::NotFound`] if no entry exists for the given key. - pub async fn unregister_context(&self, actor: Uuid, id: Uuid) -> Result<()> { - let key = make_key(actor, id); - let ctx_ks = self.contexts.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result<()> { - let exists = ctx_ks.contains_key(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to check context key").with_source(err) - })?; - - if !exists { - return Err(Error::new( - ErrorKind::NotFound, - format!("Context not found (actor: {actor}, id: {id})"), - )); - } - - ctx_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove context").with_source(err) - })?; - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - Ok(()) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - /// Removes all context entries for an actor. - /// - /// Returns the number of entries removed. 
- pub async fn unregister_all_contexts(&self, actor: Uuid) -> Result { - let prefix = actor.as_bytes().to_vec(); - let ctx_ks = self.contexts.clone(); - let db = self.db.clone(); - - tokio::task::spawn_blocking(move || -> Result { - let keys: Vec> = ctx_ks - .prefix(&prefix) - .map(|guard| { - let key = guard.key().map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to iterate contexts keyspace") - .with_source(err) - })?; - Ok(key.to_vec()) - }) - .collect::>>()?; - - let count = keys.len(); - - for key in &keys { - ctx_ks.remove(key).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to remove context").with_source(err) - })?; - } - - if count > 0 { - db.persist(fjall::PersistMode::SyncAll).map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to persist database").with_source(err) - })?; - } - - Ok(count) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - /// Lists all context IDs for an actor. - pub async fn list_contexts(&self, actor: Uuid) -> Result> { - let prefix = actor.as_bytes().to_vec(); - let ctx_ks = self.contexts.clone(); - - tokio::task::spawn_blocking(move || -> Result> { - let mut ids = Vec::new(); - for guard in ctx_ks.prefix(&prefix) { - let key = guard.key().map_err(|err| { - Error::new(ErrorKind::Internal, "Failed to iterate contexts keyspace") - .with_source(err) - })?; - if key.len() == 32 - && let Ok(bytes) = <[u8; 16]>::try_from(&key[16..]) - { - ids.push(Uuid::from_bytes(bytes)); - } - } - ids.sort(); - Ok(ids) - }) - .await - .map_err(|err| Error::new(ErrorKind::Internal, "Blocking task panicked").with_source(err))? - } - - /// Returns the base directory path (the database location). 
- pub fn base_dir(&self) -> &Path { - &self.base_dir - } -} - -#[cfg(test)] -mod tests { - use nvisy_core::content::{Content, ContentData}; - use nvisy_ontology::context::Context; - - use super::*; - - fn open_temp_registry() -> (tempfile::TempDir, Registry) { - let temp = tempfile::TempDir::new().unwrap(); - let registry = Registry::open(temp.path().join("data")).unwrap(); - (temp, registry) - } - - #[tokio::test] - async fn register_and_read_content() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - let content = Content::new(ContentData::from("Hello, world!")); - - let handle = registry.register_content(actor, content).await.unwrap(); - let data = handle.content_data().await.unwrap(); - assert_eq!(data.as_str().unwrap(), "Hello, world!"); - } - - #[tokio::test] - async fn content_scoped_by_actor() { - let (_temp, registry) = open_temp_registry(); - let actor_a = Uuid::now_v7(); - let actor_b = Uuid::now_v7(); - - let content = Content::new(ContentData::from("actor A only")); - let handle = registry.register_content(actor_a, content).await.unwrap(); - let id = handle.content_source().as_uuid(); - - // Actor B cannot see actor A's content - let err = registry.read_content(actor_b, id).await.unwrap_err(); - assert_eq!(err.kind, ErrorKind::NotFound); - - // Actor A can - registry.read_content(actor_a, id).await.unwrap(); - } - - #[tokio::test] - async fn list_content_per_actor() { - let (_temp, registry) = open_temp_registry(); - let actor_a = Uuid::now_v7(); - let actor_b = Uuid::now_v7(); - - registry - .register_content(actor_a, Content::new(ContentData::from("a1"))) - .await - .unwrap(); - registry - .register_content(actor_a, Content::new(ContentData::from("a2"))) - .await - .unwrap(); - registry - .register_content(actor_b, Content::new(ContentData::from("b1"))) - .await - .unwrap(); - - assert_eq!(registry.list_content(actor_a).await.unwrap().len(), 2); - assert_eq!(registry.list_content(actor_b).await.unwrap().len(), 1); - } 
- - #[tokio::test] - async fn unregister_content() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - let content = Content::new(ContentData::from("delete me")); - let id = content.content_source().as_uuid(); - registry.register_content(actor, content).await.unwrap(); - - registry.unregister_content(actor, id).await.unwrap(); - - let err = registry.read_content(actor, id).await.unwrap_err(); - assert_eq!(err.kind, ErrorKind::NotFound); - } - - #[tokio::test] - async fn unregister_all_content() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - - registry - .register_content(actor, Content::new(ContentData::from("first"))) - .await - .unwrap(); - registry - .register_content(actor, Content::new(ContentData::from("second"))) - .await - .unwrap(); - - let deleted = registry.unregister_all_content(actor).await.unwrap(); - assert_eq!(deleted, 2); - assert!(registry.list_content(actor).await.unwrap().is_empty()); - } - - #[tokio::test] - async fn register_and_read_context() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - let ctx = Context::new("test-context", vec![]); - - let handle = registry.register_context(actor, ctx.clone()).await.unwrap(); - let read_ctx = handle.context().await.unwrap(); - assert_eq!(read_ctx.name, "test-context"); - } - - #[tokio::test] - async fn context_scoped_by_actor() { - let (_temp, registry) = open_temp_registry(); - let actor_a = Uuid::now_v7(); - let actor_b = Uuid::now_v7(); - - let ctx = Context::new("private", vec![]); - let handle = registry.register_context(actor_a, ctx).await.unwrap(); - let id = handle.source().as_uuid(); - - let err = registry.read_context(actor_b, id).await.unwrap_err(); - assert_eq!(err.kind, ErrorKind::NotFound); - - registry.read_context(actor_a, id).await.unwrap(); - } - - #[tokio::test] - async fn list_contexts_per_actor() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - - 
registry - .register_context(actor, Context::new("ctx-1", vec![])) - .await - .unwrap(); - registry - .register_context(actor, Context::new("ctx-2", vec![])) - .await - .unwrap(); - - assert_eq!(registry.list_contexts(actor).await.unwrap().len(), 2); - } - - #[tokio::test] - async fn unregister_context() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - let ctx = Context::new("remove-me", vec![]); - let id = ctx.source.as_uuid(); - - registry.register_context(actor, ctx).await.unwrap(); - registry.unregister_context(actor, id).await.unwrap(); - - let err = registry.read_context(actor, id).await.unwrap_err(); - assert_eq!(err.kind, ErrorKind::NotFound); - } - - #[tokio::test] - async fn unregister_all_contexts() { - let (_temp, registry) = open_temp_registry(); - let actor = Uuid::now_v7(); - - registry - .register_context(actor, Context::new("c1", vec![])) - .await - .unwrap(); - registry - .register_context(actor, Context::new("c2", vec![])) - .await - .unwrap(); - - let deleted = registry.unregister_all_contexts(actor).await.unwrap(); - assert_eq!(deleted, 2); - assert!(registry.list_contexts(actor).await.unwrap().is_empty()); - } - - #[tokio::test] - async fn data_persists_across_reopen() { - let temp = tempfile::TempDir::new().unwrap(); - let path = temp.path().join("data"); - let actor = Uuid::now_v7(); - - let content = Content::new(ContentData::from("persistent")); - let id = content.content_source().as_uuid(); - - { - let registry = Registry::open(&path).unwrap(); - registry.register_content(actor, content).await.unwrap(); - } - - let registry = Registry::open(&path).unwrap(); - let handle = registry.read_content(actor, id).await.unwrap(); - let data = handle.content_data().await.unwrap(); - assert_eq!(data.as_str().unwrap(), "persistent"); - } - - #[tokio::test] - async fn base_dir() { - let temp = tempfile::TempDir::new().unwrap(); - let base = temp.path().join("data"); - let registry = Registry::open(&base).unwrap(); - 
assert_eq!(registry.base_dir(), base); - } -} diff --git a/crates/nvisy-rig/src/agent/base/mod.rs b/crates/nvisy-rig/src/agent/base/mod.rs index 00c1d811..97fba788 100644 --- a/crates/nvisy-rig/src/agent/base/mod.rs +++ b/crates/nvisy-rig/src/agent/base/mod.rs @@ -7,11 +7,11 @@ mod detection; mod provider; mod response; -pub use agent::AgentConfig; -pub(crate) use agent::{Agents, BaseAgent}; -pub(crate) use builder::BaseAgentBuilder; -pub use context::ContextWindow; -pub(crate) use detection::ALL_TYPES_HINT; -pub use detection::{DetectionConfig, DetectionRequest, DetectionResponse}; -pub use provider::AgentProvider; -pub(crate) use response::ResponseParser; +pub use self::agent::AgentConfig; +pub(crate) use self::agent::{Agents, BaseAgent}; +pub(crate) use self::builder::BaseAgentBuilder; +pub use self::context::ContextWindow; +pub(crate) use self::detection::ALL_TYPES_HINT; +pub use self::detection::{DetectionConfig, DetectionRequest, DetectionResponse}; +pub use self::provider::AgentProvider; +pub(crate) use self::response::ResponseParser; diff --git a/crates/nvisy-rig/src/agent/base/response.rs b/crates/nvisy-rig/src/agent/base/response.rs index a37ff3b7..ddb12a22 100644 --- a/crates/nvisy-rig/src/agent/base/response.rs +++ b/crates/nvisy-rig/src/agent/base/response.rs @@ -114,7 +114,7 @@ mod tests { #[test] fn parse_json_raw_array() { - let text = r#"[{"category":"pii","entity_type":"email_address","value":"a@b.com","confidence":0.9,"start_offset":0,"end_offset":7}]"#; + let text = r#"[{"category":"contact_info","entity_type":"email_address","value":"a@b.com","confidence":0.9,"start_offset":0,"end_offset":7}]"#; let result = ResponseParser::from_text(text) .parse_json::>() .unwrap(); @@ -123,7 +123,7 @@ mod tests { #[test] fn parse_json_fenced() { - let text = "```json\n[{\"category\":\"pii\",\"entity_type\":\"email_address\",\"value\":\"a@b.com\",\"confidence\":0.9}]\n```"; + let text = 
"```json\n[{\"category\":\"contact_info\",\"entity_type\":\"email_address\",\"value\":\"a@b.com\",\"confidence\":0.9}]\n```"; let result = ResponseParser::from_text(text) .parse_json::>() .unwrap(); diff --git a/crates/nvisy-rig/src/agent/cv/mod.rs b/crates/nvisy-rig/src/agent/cv/mod.rs index 6bb0199e..30cb2ec2 100644 --- a/crates/nvisy-rig/src/agent/cv/mod.rs +++ b/crates/nvisy-rig/src/agent/cv/mod.rs @@ -12,12 +12,12 @@ mod tool; use async_trait::async_trait; use base64::Engine; use base64::engine::general_purpose::STANDARD; -pub use output::{CvEntities, CvEntity}; -use prompt::{CV_SYSTEM_PROMPT, CvPromptBuilder}; use serde::Serialize; -use tool::CvRigTool; use uuid::Uuid; +pub use self::output::{CvEntities, CvEntity}; +use self::prompt::{CV_SYSTEM_PROMPT, CvPromptBuilder}; +use self::tool::CvRigTool; use super::{AgentConfig, AgentProvider, BaseAgent, DetectionConfig}; use crate::backend::UsageTracker; use crate::error::Error; diff --git a/crates/nvisy-rig/src/agent/cv/prompt.rs b/crates/nvisy-rig/src/agent/cv/prompt.rs index 513c31f1..eade555e 100644 --- a/crates/nvisy-rig/src/agent/cv/prompt.rs +++ b/crates/nvisy-rig/src/agent/cv/prompt.rs @@ -52,15 +52,14 @@ You have access to a computer vision tool that detects faces, license plates, an \n\ Your workflow:\n\ 1. Use the cv_detect_objects tool to detect objects in the provided image.\n\ -2. Analyze the detections and classify each into an entity category (pii, phi, etc.) \ - and specific entity type.\n\ +2. Analyze the detections and classify each into an entity category and specific entity type.\n\ 3. 
Return a JSON array of detected entities, each with keys: \ category, entity_type, label, confidence, bbox ([x, y, width, height] in pixels).\n\ \n\ Common entity mappings:\n\ -- face → category: pii, entity_type: biometric_data\n\ -- license_plate → category: pii, entity_type: vehicle_id\n\ -- signature → category: pii, entity_type: biometric_data\n\ -- handwriting → category: pii, entity_type: person_name (if it contains a name)\n\ +- face → category: biometric, entity_type: face\n\ +- license_plate → category: personal_identity, entity_type: vehicle_registration\n\ +- signature → category: biometric, entity_type: signature\n\ +- handwriting → category: personal_identity, entity_type: person_name (if it contains a name)\n\ \n\ If no objects are detected, return an empty array []."; diff --git a/crates/nvisy-rig/src/agent/generate/mod.rs b/crates/nvisy-rig/src/agent/generate/mod.rs index 1c22a3aa..60636871 100644 --- a/crates/nvisy-rig/src/agent/generate/mod.rs +++ b/crates/nvisy-rig/src/agent/generate/mod.rs @@ -8,10 +8,10 @@ mod output; mod prompt; use nvisy_ontology::entity::EntityKind; -pub use output::{GenOutput, GeneratedEntity}; -use prompt::{GEN_SYSTEM_PROMPT, GenPromptBuilder}; use uuid::Uuid; +pub use self::output::{GenOutput, GeneratedEntity}; +use self::prompt::{GEN_SYSTEM_PROMPT, GenPromptBuilder}; use super::{AgentConfig, AgentProvider, BaseAgent}; use crate::backend::UsageTracker; use crate::error::Error; diff --git a/crates/nvisy-rig/src/agent/mod.rs b/crates/nvisy-rig/src/agent/mod.rs index b90da594..ad04d73c 100644 --- a/crates/nvisy-rig/src/agent/mod.rs +++ b/crates/nvisy-rig/src/agent/mod.rs @@ -11,11 +11,15 @@ mod generate; mod ner; mod ocr; -pub(crate) use base::{ALL_TYPES_HINT, BaseAgent}; -pub use base::{ +pub(crate) use self::base::{ALL_TYPES_HINT, BaseAgent}; +pub use self::base::{ AgentConfig, AgentProvider, ContextWindow, DetectionConfig, DetectionRequest, DetectionResponse, }; -pub use cv::{CvAgent, CvDetection, CvEntities, CvEntity, 
CvProvider}; -pub use generate::{GenAgent, GenOutput, GenRequest, GeneratedEntity}; -pub use ner::{KnownNerEntity, NerAgent, NerContext, NerEntities, NerEntity, ResolvedOffsets}; -pub use ocr::{OcrAgent, ProposedEntity, VerificationOutput, VerificationStatus, VerifiedEntity}; +pub use self::cv::{CvAgent, CvDetection, CvEntities, CvEntity, CvProvider}; +pub use self::generate::{GenAgent, GenOutput, GenRequest, GeneratedEntity}; +pub use self::ner::{ + KnownNerEntity, NerAgent, NerContext, NerEntities, NerEntity, ResolvedOffsets, +}; +pub use self::ocr::{ + OcrAgent, ProposedEntity, VerificationOutput, VerificationStatus, VerifiedEntity, +}; diff --git a/crates/nvisy-rig/src/agent/ner/mod.rs b/crates/nvisy-rig/src/agent/ner/mod.rs index a1627b63..45dab21c 100644 --- a/crates/nvisy-rig/src/agent/ner/mod.rs +++ b/crates/nvisy-rig/src/agent/ner/mod.rs @@ -8,12 +8,12 @@ mod context; mod output; mod prompt; -pub use context::NerContext; use nvisy_http::HttpClient; -pub use output::{KnownNerEntity, NerEntities, NerEntity, ResolvedOffsets}; -use prompt::{NER_SYSTEM_PROMPT, NerPromptBuilder}; use uuid::Uuid; +pub use self::context::NerContext; +pub use self::output::{KnownNerEntity, NerEntities, NerEntity, ResolvedOffsets}; +use self::prompt::{NER_SYSTEM_PROMPT, NerPromptBuilder}; use super::{AgentConfig, AgentProvider, BaseAgent, DetectionConfig}; use crate::backend::UsageTracker; use crate::error::Error; diff --git a/crates/nvisy-rig/src/agent/ocr/input.rs b/crates/nvisy-rig/src/agent/ocr/input.rs index 89b2c19d..9308ce1f 100644 --- a/crates/nvisy-rig/src/agent/ocr/input.rs +++ b/crates/nvisy-rig/src/agent/ocr/input.rs @@ -31,7 +31,7 @@ impl ProposedEntity { }; Self { id, - category: entity.category.clone(), + category: entity.category, entity_type: entity.entity_kind, value: entity.value.clone(), confidence: entity.confidence, diff --git a/crates/nvisy-rig/src/agent/ocr/mod.rs b/crates/nvisy-rig/src/agent/ocr/mod.rs index 9078c309..6f46a8bf 100644 --- 
a/crates/nvisy-rig/src/agent/ocr/mod.rs +++ b/crates/nvisy-rig/src/agent/ocr/mod.rs @@ -11,12 +11,12 @@ mod prompt; use base64::Engine; use base64::engine::general_purpose::STANDARD; -pub use input::ProposedEntity; use nvisy_ontology::entity::Entity; -pub use output::{VerificationOutput, VerificationStatus, VerifiedEntity}; -use prompt::{OCR_SYSTEM_PROMPT, OcrPromptBuilder}; use uuid::Uuid; +pub use self::input::ProposedEntity; +pub use self::output::{VerificationOutput, VerificationStatus, VerifiedEntity}; +use self::prompt::{OCR_SYSTEM_PROMPT, OcrPromptBuilder}; use super::{AgentConfig, AgentProvider, BaseAgent}; use crate::backend::UsageTracker; use crate::error::Error; diff --git a/crates/nvisy-rig/src/agent/ocr/output.rs b/crates/nvisy-rig/src/agent/ocr/output.rs index 3e754e9d..c70d27e8 100644 --- a/crates/nvisy-rig/src/agent/ocr/output.rs +++ b/crates/nvisy-rig/src/agent/ocr/output.rs @@ -3,7 +3,10 @@ use std::collections::HashMap; use nvisy_core::math::BoundingBox; -use nvisy_ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityKind, ImageLocation}; +use nvisy_ontology::entity::{ + Entity, EntityCategory, EntityKind, ExtractionMethod, ImageLocation, RecognitionMethod, + RefinementMethod, +}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -58,9 +61,13 @@ impl VerifiedEntity { self.category.unwrap_or(entity.category), self.entity_type.unwrap_or(entity.entity_kind), self.value.as_deref().unwrap_or(&entity.value), - DetectionMethod::Ocr, + RecognitionMethod::Ner, self.confidence, ); + corrected.extraction_methods = vec![ExtractionMethod::OpticalCharacterRecognition]; + corrected + .refinement_methods + .push(RefinementMethod::ModelVerification); corrected.source = entity.source; if let Some(bbox) = self.bbox { diff --git a/crates/nvisy-rig/src/agent/ocr/prompt.rs b/crates/nvisy-rig/src/agent/ocr/prompt.rs index 75d3b2fe..b61581cb 100644 --- a/crates/nvisy-rig/src/agent/ocr/prompt.rs +++ b/crates/nvisy-rig/src/agent/ocr/prompt.rs 
@@ -78,7 +78,7 @@ mod tests { let entities = vec![ ProposedEntity { id: 0, - category: EntityCategory::Pii, + category: EntityCategory::PersonalIdentity, entity_type: EntityKind::PersonName, value: "John Doe".into(), confidence: 0.95, @@ -100,7 +100,7 @@ mod tests { ]; let prompt = OcrPromptBuilder::new(&entities).build("AAAA"); - assert!(prompt.contains("[0] category=pii")); + assert!(prompt.contains("[0] category=personal_identity")); assert!(prompt.contains("person_name")); assert!(prompt.contains("John Doe")); assert!(prompt.contains("bbox=[10.0, 20.0, 100.0, 30.0]")); diff --git a/crates/nvisy-rig/src/audio/mod.rs b/crates/nvisy-rig/src/audio/mod.rs index d76f5cf4..c18f088d 100644 --- a/crates/nvisy-rig/src/audio/mod.rs +++ b/crates/nvisy-rig/src/audio/mod.rs @@ -3,5 +3,5 @@ pub mod stt; pub mod tts; -pub use stt::SttProvider; -pub use tts::TtsProvider; +pub use self::stt::SttProvider; +pub use self::tts::TtsProvider; diff --git a/crates/nvisy-rig/src/audio/stt/mod.rs b/crates/nvisy-rig/src/audio/stt/mod.rs index 1995864b..4c7d3f46 100644 --- a/crates/nvisy-rig/src/audio/stt/mod.rs +++ b/crates/nvisy-rig/src/audio/stt/mod.rs @@ -7,12 +7,12 @@ mod provider; use nvisy_http::HttpClient; -pub(crate) use provider::SttModels; -pub use provider::SttProvider; #[cfg(feature = "openai-whisper")] use rig::transcription::TranscriptionModel; use uuid::Uuid; +pub(crate) use self::provider::SttModels; +pub use self::provider::SttProvider; use crate::error::Error; const TARGET: &str = "nvisy_rig::stt"; diff --git a/crates/nvisy-rig/src/audio/tts/mod.rs b/crates/nvisy-rig/src/audio/tts/mod.rs index d3e58782..da84a800 100644 --- a/crates/nvisy-rig/src/audio/tts/mod.rs +++ b/crates/nvisy-rig/src/audio/tts/mod.rs @@ -3,12 +3,12 @@ mod provider; use nvisy_http::HttpClient; -pub(crate) use provider::TtsModels; -pub use provider::TtsProvider; #[cfg(feature = "openai-tts")] use rig::audio_generation::AudioGenerationModel as _; use uuid::Uuid; +pub(crate) use 
self::provider::TtsModels; +pub use self::provider::TtsProvider; use crate::error::Error; const TARGET: &str = "nvisy_rig::tts"; diff --git a/crates/nvisy-rig/src/backend/mod.rs b/crates/nvisy-rig/src/backend/mod.rs index d65dee0d..525d27ff 100644 --- a/crates/nvisy-rig/src/backend/mod.rs +++ b/crates/nvisy-rig/src/backend/mod.rs @@ -2,5 +2,5 @@ mod metrics; mod provider; -pub use metrics::{UsageStats, UsageTracker}; -pub use provider::{AuthenticatedProvider, UnauthenticatedProvider}; +pub use self::metrics::{UsageStats, UsageTracker}; +pub use self::provider::{AuthenticatedProvider, UnauthenticatedProvider}; diff --git a/crates/nvisy-rig/src/backend/provider/mod.rs b/crates/nvisy-rig/src/backend/provider/mod.rs index 3d892313..beaff0ed 100644 --- a/crates/nvisy-rig/src/backend/provider/mod.rs +++ b/crates/nvisy-rig/src/backend/provider/mod.rs @@ -6,5 +6,5 @@ mod authenticated; mod unauthenticated; -pub use authenticated::AuthenticatedProvider; -pub use unauthenticated::UnauthenticatedProvider; +pub use self::authenticated::AuthenticatedProvider; +pub use self::unauthenticated::UnauthenticatedProvider; diff --git a/crates/nvisy-server/src/extract/mod.rs b/crates/nvisy-server/src/extract/mod.rs index 7f8faeae..bc696e5a 100644 --- a/crates/nvisy-server/src/extract/mod.rs +++ b/crates/nvisy-server/src/extract/mod.rs @@ -4,6 +4,6 @@ mod json; mod path; mod version; -pub use json::Json; -pub use path::Path; -pub use version::Version; +pub use self::json::Json; +pub use self::path::Path; +pub use self::version::Version; diff --git a/crates/nvisy-server/src/handler/error/from_core.rs b/crates/nvisy-server/src/handler/error/from_core.rs index 68644d30..1bacfef1 100644 --- a/crates/nvisy-server/src/handler/error/from_core.rs +++ b/crates/nvisy-server/src/handler/error/from_core.rs @@ -19,7 +19,7 @@ impl From for Error<'static> { }; let mut error = Self::new(kind).with_message(err.message); - if let Some(component) = err.source_component { + if let Some(component) = 
err.component { error = error.with_context(component); } error diff --git a/crates/nvisy-server/src/handler/error/mod.rs b/crates/nvisy-server/src/handler/error/mod.rs index 7d733be7..0687adaf 100644 --- a/crates/nvisy-server/src/handler/error/mod.rs +++ b/crates/nvisy-server/src/handler/error/mod.rs @@ -8,5 +8,5 @@ mod from_core; mod http_error; mod http_kind; -pub use http_error::{Error, Result}; -pub use http_kind::ErrorKind; +pub use self::http_error::{Error, Result}; +pub use self::http_kind::ErrorKind; diff --git a/crates/nvisy-server/src/handler/mod.rs b/crates/nvisy-server/src/handler/mod.rs index 8cb5cce0..2a7cc769 100644 --- a/crates/nvisy-server/src/handler/mod.rs +++ b/crates/nvisy-server/src/handler/mod.rs @@ -21,8 +21,8 @@ mod request; mod response; use aide::axum::ApiRouter; -pub use error::{Error, ErrorKind, Result}; +pub use self::error::{Error, ErrorKind, Result}; use crate::service::ServiceState; /// Build the handler route tree. diff --git a/crates/nvisy-server/src/handler/request/mod.rs b/crates/nvisy-server/src/handler/request/mod.rs index 9a60105e..d01612b2 100644 --- a/crates/nvisy-server/src/handler/request/mod.rs +++ b/crates/nvisy-server/src/handler/request/mod.rs @@ -9,7 +9,7 @@ mod files; mod path; mod process; -pub use contexts::NewContext; -pub use files::NewFile; -pub use path::{ActorQuery, ContentPath, ContextPath}; -pub use process::NewProcess; +pub use self::contexts::NewContext; +pub use self::files::NewFile; +pub use self::path::{ActorQuery, ContentPath, ContextPath}; +pub use self::process::NewProcess; diff --git a/crates/nvisy-server/src/handler/response/mod.rs b/crates/nvisy-server/src/handler/response/mod.rs index b2f7fa9c..6c57c58b 100644 --- a/crates/nvisy-server/src/handler/response/mod.rs +++ b/crates/nvisy-server/src/handler/response/mod.rs @@ -11,8 +11,8 @@ mod error; mod files; mod process; -pub use check::{Analytics, Health, ServiceStatus}; -pub use contexts::{Context, ContextId, ContextList}; -pub use 
error::ErrorResponse; -pub use files::{File, FileId, FileList}; -pub use process::ProcessResult; +pub use self::check::{Analytics, Health, ServiceStatus}; +pub use self::contexts::{Context, ContextId, ContextList}; +pub use self::error::ErrorResponse; +pub use self::files::{File, FileId, FileList}; +pub use self::process::ProcessResult; diff --git a/crates/nvisy-server/src/lib.rs b/crates/nvisy-server/src/lib.rs index 8c92dde0..8d6a4241 100644 --- a/crates/nvisy-server/src/lib.rs +++ b/crates/nvisy-server/src/lib.rs @@ -7,6 +7,6 @@ pub mod handler; pub mod middleware; pub mod service; -pub use handler::error::{Error, ErrorKind, Result}; -pub use handler::routes; -pub use service::ServiceState; +pub use self::handler::error::{Error, ErrorKind, Result}; +pub use self::handler::routes; +pub use self::service::ServiceState; diff --git a/crates/nvisy-server/src/middleware/mod.rs b/crates/nvisy-server/src/middleware/mod.rs index c4f1e18c..eb25d59c 100644 --- a/crates/nvisy-server/src/middleware/mod.rs +++ b/crates/nvisy-server/src/middleware/mod.rs @@ -47,10 +47,10 @@ mod recovery; mod security; mod specification; -pub use constants::{ +pub use self::constants::{ DEFAULT_MAX_BODY_SIZE, DEFAULT_MAX_FILE_BODY_SIZE, DEFAULT_REQUEST_TIMEOUT_SECS, }; -pub use observability::RouterObservabilityExt; -pub use recovery::{RecoveryConfig, RouterRecoveryExt}; -pub use security::{RouterSecurityExt, SecurityConfig}; -pub use specification::{OpenApiConfig, RouterOpenApiExt}; +pub use self::observability::RouterObservabilityExt; +pub use self::recovery::{RecoveryConfig, RouterRecoveryExt}; +pub use self::security::{RouterSecurityExt, SecurityConfig}; +pub use self::specification::{OpenApiConfig, RouterOpenApiExt};