Merged
4 changes: 1 addition & 3 deletions .github/workflows/ci.yaml
@@ -40,9 +40,7 @@ jobs:

   # Run tests with coverage
   - name: Run tests under coverage
-    run: |
-      coverage run --source=pyversity -m pytest
-      coverage report
+    run: make test

# Upload results to Codecov
- name: Upload results to Codecov
4 changes: 2 additions & 2 deletions Makefile
@@ -3,10 +3,10 @@ install:
 	uv run pre-commit install
 
 install-no-pre-commit:
-	uv pip install ".[dev]"
+	uv pip install ".[dev,benchmarks]"
 
 fix:
 	uv run pre-commit run --all-files
 
 test:
-	uv run pytest --cov=PACKAGE --cov-report=term-missing
+	uv run pytest
Owner:
I just noticed this; this is nicer than it was, but I think `ci.yaml` also needs a small update now. In Codecov I see:

[Codecov screenshot]

I think this is because we now have both `coverage run` and `pytest-cov` running; e.g. in CI I see conflicting 100% and 0% coverage tables. However, your approach is better IMO, so I think the nicest thing to do is to update `ci.yaml` by changing

  # Run tests with coverage
  - name: Run tests under coverage
    run: |
      coverage run --source=pyversity -m pytest
      coverage report

to

  - name: Run tests
    run: uv run pytest 

That should work I think 🤞

Contributor Author:
Or just `make test`? :)

Owner:
ah yes, forgot about my own make commands 🤦
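
The thread above converges on delegating the CI step to the Makefile target. A sketch of the resulting `ci.yaml` step under that agreement (the step name is my assumption; the coverage flags now live in `pyproject.toml`, so `pytest-cov` applies them automatically when `make test` invokes pytest):

```yaml
      # Coverage configuration comes from pyproject.toml's addopts,
      # so a plain Makefile invocation is enough here.
      - name: Run tests
        run: make test
```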

12 changes: 12 additions & 0 deletions README.md
@@ -107,6 +107,8 @@ This improves exploration, user satisfaction, and coverage across many domains:

MMR and DPP work well as general-purpose diversifiers. In product search, use them to avoid showing near-duplicate results:

<!-- pytestfixture: item_embeddings -->
<!-- pytestfixture: item_scores -->
```python
from pyversity import diversify, Strategy

@@ -128,6 +130,8 @@ result = diversify(

COVER ensures the selected items collectively represent the full topic space. For academic papers, this means covering different subfields and methodologies:

<!-- pytestfixture: paper_embeddings -->
<!-- pytestfixture: paper_scores -->
```python
from pyversity import diversify, Strategy

@@ -151,6 +155,9 @@

In conversational RAG, you want to avoid feeding the model redundant chunks. SSD diversifies relative to recent context, making it a natural fit:

<!-- pytestfixture: chunk_embeddings -->
<!-- pytestfixture: chunk_scores -->
<!-- pytestfixture: recent_chunk_embeddings -->
```python
import numpy as np
from pyversity import diversify, Strategy
@@ -179,6 +186,9 @@ recent_chunk_embeddings = np.vstack([recent_chunk_embeddings, chunk_embeddings[r

In content feeds, users consume items sequentially. SSD introduces novelty relative to recently shown items, keeping the experience fresh:

<!-- pytestfixture: feed_embeddings -->
<!-- pytestfixture: feed_scores -->
<!-- pytestfixture: recent_feed_embeddings -->
```python
import numpy as np
from pyversity import diversify, Strategy
@@ -207,6 +217,8 @@ recent_feed_embeddings = np.vstack([recent_feed_embeddings, feed_embeddings[resu

When extracting from a long document, you want sections that cover different parts. MSD prefers items that are far apart from each other:

<!-- pytestfixture: doc_chunk_embeddings -->
<!-- pytestfixture: doc_chunk_scores -->
```python
from pyversity import diversify, Strategy

4 changes: 3 additions & 1 deletion benchmarks/README.md
@@ -284,15 +284,17 @@ python -m benchmarks report
<details>
<summary>Programmatic API</summary>

<!-- pytestfixture: benchmark_data -->
```python
from benchmarks import BenchmarkConfig, run_benchmark
from pyversity import Strategy

 config = BenchmarkConfig(
-    dataset_path="benchmarks/data/ml-32m",
+    dataset="ml-32m",
     sample_users=2000,
     strategies=[Strategy.MMR, Strategy.DPP, Strategy.MSD, Strategy.SSD],
     diversity_values=[0.0, 0.3, 0.5, 0.7, 1.0],
+    output_dir=benchmark_data,
 )
results = run_benchmark(config)
```
34 changes: 33 additions & 1 deletion pyproject.toml
@@ -41,7 +41,8 @@ dev = [
"mypy",
"pre-commit",
"pytest",
"pytest-coverage",
"pytest-codeblock",
"pytest-cov",
"ruff",
"types-requests",
]
@@ -114,3 +115,34 @@ pyversity = ["py.typed"]

[tool.setuptools.dynamic]
version = {attr = "pyversity.version.__version__"}

[tool.pytest.ini_options]
addopts = [
"-ra",
"-vvv",
"-q",
"-p", "tests.docs_conftest",
"--cov=pyversity",
"--cov=benchmarks",
"--cov-report=html",
"--cov-report=term-missing",
"--cov-append",
"--capture=no",
]
pythonpath = ["src", "."]
norecursedirs = [".git"]

[tool.coverage.run]
relative_files = true
omit = ["benchmarks/*"]
source = ["pyversity"]

[tool.coverage.report]
show_missing = true
exclude_lines = [
"pragma: no cover",
"@overload",
]

[tool.pytest-codeblock]
test_nameless_codeblocks = true
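
For context on the section above: `pytest-codeblock` collects fenced code blocks from markdown files as tests (with `test_nameless_codeblocks = true`, presumably including blocks that carry no explicit name), and the `<!-- pytestfixture: ... -->` comments added to the READMEs in this PR request pytest fixtures by name for the block that follows. A minimal sketch of the pattern, using one of the fixture names defined in `tests/docs_conftest.py` (exactly how the plugin exposes the fixture inside the block is an assumption on my part):

````markdown
<!-- pytestfixture: item_embeddings -->
```python
print(item_embeddings.shape)  # fixture injected by the plugin (assumed)
```
````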
148 changes: 148 additions & 0 deletions tests/docs_conftest.py
@@ -0,0 +1,148 @@
"""
Fixtures for testing the README examples.

DO NOT USE ANY OF THESE IN OTHER TESTS!
"""

from pathlib import Path
from typing import Generator

import numpy as np
import pandas as pd
import pytest

_rng = np.random.default_rng(0)

_N = 20 # number of candidate items
_D = 16 # embedding dimension
_R = 5 # number of recent items


def _emb(n: int = _N) -> np.ndarray:
return _rng.standard_normal((n, _D)).astype(np.float64)


def _scores(n: int = _N) -> np.ndarray:
raw = _rng.random(n).astype(np.float64)
return raw / raw.sum()


# --- benchmarks/README.md: Programmatic API ---


@pytest.fixture(scope="session")
def benchmark_data(tmp_path_factory: pytest.TempPathFactory) -> Generator[Path, None, None]:
"""
Tiny synthetic MovieLens-format dataset for testing the benchmark API.

Patches DATASET_REGISTRY["ml-32m"] to point at a temp directory containing
a minimal ratings.csv, and returns a temp output directory to use as
BenchmarkConfig.output_dir so the test never writes to the real results tree.
"""
from benchmarks.core.data import DATASET_REGISTRY

# Create fake ratings.csv: 100 users × 20 items each, rating=4.5 (above 4.0 threshold)
rng = np.random.default_rng(0)
n_users, n_items, per_user = 100, 50, 20
user_ids = np.repeat(np.arange(1, n_users + 1), per_user)
item_ids = np.concatenate([rng.choice(n_items, size=per_user, replace=False) + 1 for _ in range(n_users)])
df = pd.DataFrame({"userId": user_ids, "movieId": item_ids, "rating": 4.5})

data_dir: Path = tmp_path_factory.mktemp("ml-32m")
df.to_csv(data_dir / "ratings.csv", index=False)

out_dir: Path = tmp_path_factory.mktemp("benchmark_results")

original_path = DATASET_REGISTRY["ml-32m"].path
DATASET_REGISTRY["ml-32m"].path = str(data_dir)

yield out_dir

DATASET_REGISTRY["ml-32m"].path = original_path


# --- Product / Web Search (test_README_2) ---


@pytest.fixture
def item_embeddings() -> np.ndarray:
"""Item embeddings for testing the product/web search example."""
return _emb()


@pytest.fixture
def item_scores() -> np.ndarray:
"""Item scores for testing the product/web search example."""
return _scores()


# --- Literature Search (test_README_3) ---


@pytest.fixture
def paper_embeddings() -> np.ndarray:
"""Paper embeddings for testing the literature search example."""
return _emb()


@pytest.fixture
def paper_scores() -> np.ndarray:
"""Paper scores for testing the literature search example."""
return _scores()


# --- Conversational RAG (test_README_4) ---


@pytest.fixture
def chunk_embeddings() -> np.ndarray:
"""Chunk embeddings for testing the conversational RAG example."""
return _emb()


@pytest.fixture
def chunk_scores() -> np.ndarray:
"""Chunk scores for testing the conversational RAG example."""
return _scores()


@pytest.fixture
def recent_chunk_embeddings() -> np.ndarray:
"""Recent chunk embeddings for testing the conversational RAG example."""
return _emb(_R)


# --- Infinite Scroll / Recommendation Feed (test_README_5) ---


@pytest.fixture
def feed_embeddings() -> np.ndarray:
"""Feed item embeddings for testing the infinite scroll / recommendation feed example."""
return _emb()


@pytest.fixture
def feed_scores() -> np.ndarray:
"""Feed item scores for testing the infinite scroll / recommendation feed example."""
return _scores()


@pytest.fixture
def recent_feed_embeddings() -> np.ndarray:
"""Recent feed item embeddings for testing the infinite scroll / recommendation feed example."""
return _emb(_R)


# --- Single Long Document (test_README_6) ---


@pytest.fixture
def doc_chunk_embeddings() -> np.ndarray:
"""Document chunk embeddings for testing the single long document example."""
return _emb()


@pytest.fixture
def doc_chunk_scores() -> np.ndarray:
"""Document chunk scores for testing the single long document example."""
return _scores()
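
The shape and normalization contract of the `_emb`/`_scores` helpers can be illustrated with a standalone re-creation (this mirrors the conftest code rather than importing it):

```python
import numpy as np

# Standalone mirror of the docs_conftest helpers.
rng = np.random.default_rng(0)
N, D = 20, 16  # number of candidates, embedding dimension

def emb(n: int = N) -> np.ndarray:
    """Random embeddings, shape (n, D)."""
    return rng.standard_normal((n, D)).astype(np.float64)

def scores(n: int = N) -> np.ndarray:
    """Non-negative relevance scores normalized to sum to 1."""
    raw = rng.random(n).astype(np.float64)
    return raw / raw.sum()

e, s = emb(), scores()
print(e.shape, s.shape)
```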
41 changes: 16 additions & 25 deletions uv.lock

Some generated files are not rendered by default.