Merged
27 changes: 13 additions & 14 deletions .github/workflows/test.yml
@@ -18,20 +18,19 @@ jobs:
with:
python-version: '3.12'

- name: Install dependencies
- name: Create & activate virtualenv and install dependencies
run: |
python -m venv .venv
source .venv/bin/activate
pip install --upgrade pip
pip install .[development]

- run: isort --diff dfetch_hub # Checks import order
- run: black --check dfetch_hub # Checks code style
- run: ruff check dfetch_hub # Fast linter (bugbear, comprehensions, naming, …)
- run: pylint dfetch_hub # Deep static analysis
- run: mypy --strict dfetch_hub # mypy type check (strict)
- run: pyright dfetch_hub # pyright type check (standard)
- run: bandit -c pyproject.toml -r dfetch_hub # Security linting
- run: xenon --max-absolute C --max-modules B --max-average A dfetch_hub # Cyclomatic complexity
- run: pyroma --directory --min 9 . # Package metadata quality
- run: djlint --lint dfetch_hub # HTML linting
- run: pydocstyle --convention=google dfetch_hub # Docstring style (Google convention)
- run: doc8 --extension .md --max-line-length 120 README.md # Documentation prose style
- run: pytest --cov=dfetch_hub test # Run tests with coverage
- name: Run pre-commit
run: |
source .venv/bin/activate
pre-commit run --all-files

- name: Run tests
run: |
source .venv/bin/activate
pytest
1 change: 1 addition & 0 deletions .gitignore
@@ -11,4 +11,5 @@ doc/_build
doc/landing-page/_build
example/Tests/
venv*
.venv
public
9 changes: 8 additions & 1 deletion .pre-commit-config.yaml
@@ -45,7 +45,7 @@ repos:
args: [dfetch_hub]
- id: xenon
name: cyclomatic complexity
entry: xenon --max-absolute C --max-modules B --max-average A
entry: xenon --max-absolute B --max-modules A --max-average A
language: python
files: ^dfetch_hub/
types: [file, python]
@@ -73,3 +73,10 @@ repos:
entry: doc8 --max-line-length 120 --extension .md
language: python
types_or: [rst, markdown]
- id: codespell
name: codespell
description: Checks for common misspellings in text files.
entry: codespell --toml pyproject.toml
language: python
types: [text]
exclude: ^dfetch_hub/data/
62 changes: 53 additions & 9 deletions CLAUDE.md
@@ -19,10 +19,10 @@
## Data model

- Use `@dataclass` for all structured data.
- Mutable defaults must use `field(default_factory=...)`.
- Shared structural interface: `ComponentManifest` Protocol in `updater.py` — new manifest types must satisfy it via
duck-typing (no explicit inheritance needed). Optional extra fields (`topics`, `readme_content`, …) are accessed
with `getattr(manifest, "field", default)` in the updater.
- Mutable defaults must use `field(default_factory=list)` / `field(default_factory=dict)` — **never** `lambda: []` or
`lambda: {}` (ruff PIE807).
- New manifest types inherit from `BaseManifest` (in `catalog/sources/__init__.py`). Optional extra fields (`topics`,
`readme_content`, …) are accessed with `getattr(manifest, "field", default)` in the writer.
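A minimal sketch of the two conventions above (the class name is illustrative, not from the codebase):

```python
from dataclasses import dataclass, field


@dataclass
class ExampleManifest:
    """Illustrative manifest with safe mutable defaults."""

    name: str
    topics: list[str] = field(default_factory=list)  # not `lambda: []` (ruff PIE807)
    extras: dict[str, str] = field(default_factory=dict)  # not `lambda: {}`


first = ExampleManifest(name="first")
first.topics.append("c")
# Each instance gets its own list; optional fields are read defensively:
assert ExampleManifest(name="second").topics == []
assert getattr(first, "readme_content", None) is None
```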

## Module structure

@@ -51,9 +51,19 @@
- Use `logger = get_logger(__name__)` from `dfetch.log`.
- Raise `RuntimeError` only for infrastructure failures (e.g. missing fetch output).

## Package resources

- **Never** use `Path(__file__)` to locate bundled files. Use `importlib.resources` instead:
```python
import importlib.resources
_DEFAULT_DATA_DIR: Path = Path(str(importlib.resources.files("dfetch_hub") / "data"))
```
- The `str()` + `Path()` conversion materialises the `Traversable` into a real filesystem path.
This is safe because setuptools installs packages as directories, not zips.

## Catalog pipeline conventions

- `fetch_source()` (fetcher.py) uses the dfetch Python API — no `subprocess`.
- `clone_source()` (catalog/cloner.py) uses the dfetch Python API — no `subprocess`.
- External HTTP calls use stdlib `urllib.request` only — no `requests` dependency.
- GitHub org/repo values are **always lowercased** at extraction time so catalog IDs, file paths, and JSON fields
stay consistent.
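A minimal sketch of the lowercasing convention (the helper name is hypothetical, not part of the codebase):

```python
def extract_slug(owner: str, repo: str) -> str:
    """Hypothetical extraction-time helper: lowercase once, at the boundary."""
    # Lowercasing here means catalog IDs, file paths, and JSON fields
    # downstream never disagree about case.
    return f"{owner.lower()}/{repo.lower()}"


print(extract_slug("OctoCat", "Hello-World"))  # → octocat/hello-world
```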
@@ -77,8 +87,42 @@

## Tooling

All tools live in `.venv/`. Activate the virtual environment before running any tool:

```bash
source .venv/bin/activate
pre-commit run --all-files # full hook suite (must all pass before committing)
pytest # full test suite (206 tests, all mocked — no network)
```
pre-commit run --all-files # isort + black + pylint
.venv/bin/mypy dfetch_hub # strict type check
.venv/bin/pytest # full test suite
```

### pre-commit hooks

| Hook | What it enforces |
|---|---|
| isort | Import order (stdlib → third-party → local, `profile = "black"`) |
| black | Code formatting, 120-char lines |
| ruff | Lint: `A B C4 E F G N PERF PIE PTH RET RSE RUF SIM T20 TCH TRY UP W` |
| pylint | Structural lint; design limits (see `[tool.pylint.design]` in `pyproject.toml`) |
| mypy | Strict type checking |
| pyright | Strict type checking (supplementary; some checks suppressed in `pyproject.toml`) |
| bandit | Security linting |
| xenon | Cyclomatic complexity: `--max-absolute B --max-modules A --max-average A` |
| pyroma | Package metadata quality |
| djlint | HTML linting |
| pydocstyle | Docstring style (Google convention) |
| doc8 | RST/Markdown style, 120-char lines |

### Code quality gotchas

- **pylint `too-many-locals`** counts function *parameters* as locals. Limit is 12 total
(params + body variables). Inline a temporary variable or extract a helper to reduce the count.
- **ruff TC003**: imports from `collections.abc` or `pathlib` used *only in annotations* must live
inside the `if TYPE_CHECKING:` block (safe with `from __future__ import annotations`).
- **ruff PIE807**: use `field(default_factory=list)` / `field(default_factory=dict)`, not lambdas.
- **xenon average A (≤ 5.0)**: extracting a helper lowers the module average even when the total CC
sum stays the same — it adds a new block to the denominator.
- **suppress comments must be on the same line as the violation**, not on a trailing `)`. Black may
move a trailing comment from `_fn(\n arg\n) # noqa` to the `)` line, detaching it from the
actual offending expression. Put the comment on the opening `_fn( # noqa` line instead.
- **`str(None) == "None"`** (truthy): when converting an `object` value to `str | None`, check
`value` first — `return str(value) if value else None` — not `s = str(value); return s if s`.
9 changes: 2 additions & 7 deletions dfetch_hub/catalog/cloner.py
@@ -63,7 +63,6 @@ def create_manifest(source: SourceConfig, dest_dir: Path) -> Path:
src=source.path,
branch=source.branch or "",
revision="",
repo_path="",
vcs="git",
)
manifest_dict = ManifestDict(
@@ -108,12 +107,8 @@ def clone_source(source: SourceConfig, dest_dir: Path) -> Path:

cloned = dest_dir / source.name
if not cloned.resolve().is_relative_to(dest_dir.resolve()):
raise RuntimeError(
f"Source name {source.name!r} resolves outside dest_dir {dest_dir}"
)
raise RuntimeError(f"Source name {source.name!r} resolves outside dest_dir {dest_dir}")
if not cloned.is_dir():
raise RuntimeError(
f"Expected dfetch output directory {cloned} not found after update"
)
raise RuntimeError(f"Expected dfetch output directory {cloned} not found after update")
logger.debug("Clone complete: %s", cloned)
return cloned
22 changes: 16 additions & 6 deletions dfetch_hub/catalog/sources/__init__.py
@@ -53,15 +53,25 @@ def parse_vcs_slug(url: str) -> tuple[str, str, str] | None:

_HEADERS = {"User-Agent": "dfetch-hub/0.0.1"}
_README_NAMES = ("README.md", "readme.md", "Readme.md", "README.rst", "README")
_RAW_BRANCHES = ("main", "master")
RAW_BRANCHES = ("main", "master")


def _raw_url(owner: str, repo: str, branch: str, filename: str) -> str:
"""Build a raw.githubusercontent.com URL for a specific file."""
def raw_url(owner: str, repo: str, branch: str, filename: str) -> str:
"""Build a raw.githubusercontent.com URL for a specific file.

Args:
owner: Repository owner or organization.
repo: Repository name.
branch: Branch name to fetch from.
filename: Filename within the repository root.

Returns:
Raw GitHub content URL for the requested file.
"""
return f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{filename}"


def _fetch_raw(url: str) -> str | None:
def fetch_raw(url: str) -> str | None:
"""GET *url* and return the response body as a string, or ``None`` on failure."""
Comment on lines +59 to 75
⚠️ Potential issue | 🟡 Minor

Add Google-style docstrings for the newly public helpers.

raw_url and fetch_raw are now public APIs but still have private-helper style one-line docstrings. Please add Args: and Returns: sections.

Suggested docstring update
 def raw_url(owner: str, repo: str, branch: str, filename: str) -> str:
-    """Build a raw.githubusercontent.com URL for a specific file."""
+    """Build a raw.githubusercontent.com URL for a specific file.
+
+    Args:
+        owner: Repository owner or organization.
+        repo: Repository name.
+        branch: Branch name to fetch from.
+        filename: Filename within the repository root.
+
+    Returns:
+        Raw GitHub content URL for the requested file.
+    """
     return f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{filename}"
 
 
 def fetch_raw(url: str) -> str | None:
-    """GET *url* and return the response body as a string, or ``None`` on failure."""
+    """Fetch a URL and return the response body.
+
+    Args:
+        url: Raw content URL to fetch.
+
+    Returns:
+        Response body as text, or ``None`` on network/read failure.
+    """
     try:

As per coding guidelines: Every public module, class, and function must have a Google-style docstring with Args:, Returns:, and Raises: sections where relevant.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@dfetch_hub/catalog/sources/__init__.py` around lines 59 - 65, Add
Google-style docstrings for the two public functions raw_url and fetch_raw:
replace the one-line docstrings with multi-line docstrings that include Args:
(document owner, repo, branch, filename for raw_url; url for fetch_raw),
Returns: (str for raw_url; str | None for fetch_raw) and Raises: where relevant
(e.g., document that fetch_raw may raise network/requests exceptions or return
None on failure). Ensure the docstrings follow Google style and are placed
immediately above the function definitions for raw_url and fetch_raw.

try:
req = Request(url, headers=_HEADERS)
Expand All @@ -85,9 +95,9 @@ def fetch_readme(owner: str, repo: str) -> str | None:
The raw README text on success, or ``None`` if nothing is found.

"""
for branch in _RAW_BRANCHES:
for branch in RAW_BRANCHES:
for name in _README_NAMES:
content = _fetch_raw(_raw_url(owner, repo, branch, name))
content = fetch_raw(raw_url(owner, repo, branch, name))
if content is not None:
logger.debug("Fetched %s for %s/%s from %s", name, owner, repo, branch)
return content
138 changes: 86 additions & 52 deletions dfetch_hub/catalog/sources/clib.py
@@ -10,12 +10,12 @@
from dfetch.log import get_logger

from dfetch_hub.catalog.sources import (
_RAW_BRANCHES,
RAW_BRANCHES,
BaseManifest,
_fetch_raw,
_raw_url,
fetch_raw,
fetch_readme,
parse_vcs_slug,
raw_url,
)

if TYPE_CHECKING:
@@ -56,8 +56,8 @@ def _fetch_package_json(owner: str, repo: str) -> dict[str, object] | None:

Tries ``main`` then ``master`` branch. Returns ``None`` on failure.
"""
for branch in _RAW_BRANCHES:
raw = _fetch_raw(_raw_url(owner, repo, branch, "package.json"))
for branch in RAW_BRANCHES:
raw = fetch_raw(raw_url(owner, repo, branch, "package.json"))
if raw is None:
continue
try:
@@ -92,6 +92,43 @@ def _build_urls(vcs_url: str, canonical_url: str | None) -> dict[str, str]:
return urls


def _str_or_none(value: object) -> str | None:
"""Return ``str(value)`` when *value* is truthy, else ``None``."""
return str(value) if value else None


def _pkg_json_keywords(raw: object) -> list[str]:
"""Extract a flat keyword list from the ``keywords`` field of ``package.json``."""
if isinstance(raw, list):
return [str(k) for k in raw]
if isinstance(raw, str):
return [raw]
return []


def _enrich_from_pkg_json(
pkg_json: dict[str, object], vcs_url: str, tagline: str, repo: str
) -> tuple[str, str, str | None, str | None, list[str], str]:
"""Extract package metadata from ``package.json``.

Args:
pkg_json: Parsed ``package.json`` dict.
vcs_url: VCS repository URL used as the fallback homepage.
tagline: Description extracted from the wiki bullet (used when ``package.json`` has none).
repo: Repository name used as the fallback package name.

Returns:
``(package_name, description, license, version, keywords, canonical_url)``
"""
package_name = str(pkg_json.get("name") or repo)
description = _str_or_none(pkg_json.get("description")) or tagline or ""
license_val = _str_or_none(pkg_json.get("license"))
version_val = _str_or_none(pkg_json.get("version"))
json_kws = _pkg_json_keywords(pkg_json.get("keywords"))
canonical_url = _str_or_none(pkg_json.get("homepage")) or vcs_url
return package_name, description, license_val, version_val, json_kws, canonical_url


def _build_package( # pylint: disable=too-many-locals
host: str, owner: str, repo: str, tagline: str, category: str
) -> CLibPackage:
@@ -118,31 +155,20 @@ def _build_package( # pylint: disable=too-many-locals
pkg_json = _fetch_package_json(owner, repo) if is_github else None

if pkg_json is not None:
package_name = str(pkg_json.get("name") or repo)
description = tagline or str(pkg_json.get("description") or "")
license_val = str(pkg_json.get("license") or "") or None
version_val = str(pkg_json.get("version") or "") or None
raw_keywords = pkg_json.get("keywords")
if isinstance(raw_keywords, list):
json_kws: list[str] = [str(k) for k in raw_keywords]
elif isinstance(raw_keywords, str):
json_kws = [raw_keywords]
else:
json_kws = []
# Prefer an explicit homepage from package.json (e.g. project website);
# fall back to the VCS repo URL so the field is always populated.
canonical_url: str | None = str(pkg_json.get("homepage") or "") or vcs_url
package_name, description, license_val, version_val, json_kws, canonical_url = _enrich_from_pkg_json(
pkg_json, vcs_url, tagline, repo
)
else:
package_name = repo
description = tagline
license_val = None
version_val = None
json_kws = []
package_name, description, license_val, version_val, json_kws = (
repo,
tagline,
None,
None,
[],
)
canonical_url = vcs_url

keywords: list[str] = ([category] if category else []) + [
k for k in json_kws if k != category
]
keywords: list[str] = ([category] if category else []) + [k for k in json_kws if k != category]
return CLibPackage(
entry_name=f"{host}/{owner}/{repo}",
package_name=package_name,
Expand All @@ -156,9 +182,36 @@ def _build_package( # pylint: disable=too-many-locals
)


def parse_packages_md(
packages_md: "Path", limit: int | None = None
) -> list[CLibPackage]:
def _process_wiki_line(line: str, current_category: str) -> tuple[str, CLibPackage | None]:
"""Process one Packages.md line; return updated category and optional package.

Args:
line: A single line from ``Packages.md``.
current_category: The most-recently seen section heading.

Returns:
A ``(category, package)`` pair where *category* may be updated and
*package* is ``None`` for non-bullet or unrecognised lines.
"""
heading_match = _HEADING_RE.match(line)
if heading_match:
return heading_match.group(1).strip(), None

bullet_match = _BULLET_RE.match(line)
if not bullet_match:
return current_category, None

_link_text, url, tagline = bullet_match.groups()
parsed = parse_vcs_slug(url)
if not parsed:
logger.debug("Skipping URL without recognized VCS host in Packages.md: %s", url)
return current_category, None

host, owner, repo = parsed
return current_category, _build_package(host, owner, repo, (tagline or "").strip(), current_category)


def parse_packages_md(packages_md: "Path", limit: int | None = None) -> list[CLibPackage]:
"""Parse ``Packages.md`` from the clib wiki into a list of :class:`CLibPackage`.

For each bullet-point entry the function:
@@ -179,29 +232,10 @@ def parse_packages_md(
current_category: str = ""

for line in packages_md.read_text(encoding="utf-8").splitlines():
heading_match = _HEADING_RE.match(line)
if heading_match:
current_category = heading_match.group(1).strip()
continue

bullet_match = _BULLET_RE.match(line)
if not bullet_match:
continue

_link_text, url, tagline = bullet_match.groups()
parsed = parse_vcs_slug(url)
if not parsed:
logger.debug(
"Skipping URL without recognized VCS host in Packages.md: %s", url
)
continue

if limit is not None and len(packages) >= limit:
break

host, owner, repo = parsed
packages.append(
_build_package(host, owner, repo, (tagline or "").strip(), current_category)
)
current_category, pkg = _process_wiki_line(line, current_category)
if pkg is not None:
packages.append(pkg)

return packages