diff --git a/dfetch-hub.toml b/dfetch-hub.toml index 26cb9d0..d683b65 100644 --- a/dfetch-hub.toml +++ b/dfetch-hub.toml @@ -96,8 +96,8 @@ label = "clib" # label = "npm" # # ── EXAMPLE: git-regex — find all repos referenced in a config file ─────────── -# # Clone (shallow) one "index repo" and extract all git URLs from files -# # matching `file_glob`. Useful for Awesome-lists, PlatformIO registries, etc. +# Clone (shallow) one "index repo" and extract all git URLs from files +# matching `file_glob`. Useful for Awesome-lists, PlatformIO registries, etc. # [[source]] # name = "awesome-embedded" # strategy = "git-regex" @@ -108,6 +108,17 @@ label = "clib" # # Optionally restrict to repos whose path matches: # # path_filter = ".*" +# ── EXAMPLE: subfolders — monorepo with readme manifests ─────────────────────── +# A monorepo where packages live in a `packages/` subfolder, each with a README. +# The `manifest = "readme"` tells dfetch-hub to scan for README.* files. +[[source]] +name = "ts-monorepo" +strategy = "subfolders" +url = "https://github.com/bakeruk/modern-typescript-monorepo-example" +path = "packages" +manifest = "readme" +label = "ts-monorepo" + # # ── EXAMPLE: SVN — single repo ──────────────────────────────────────────────── # [[source]] # name = "my-svn-lib" diff --git a/dfetch_hub/catalog/detail.py b/dfetch_hub/catalog/detail.py new file mode 100644 index 0000000..324fc97 --- /dev/null +++ b/dfetch_hub/catalog/detail.py @@ -0,0 +1,345 @@ +"""Catalog detail data model.""" + +from __future__ import annotations + +import json +from datetime import UTC, datetime +from typing import TYPE_CHECKING, Any + +from dfetch.log import get_logger +from dfetch.vcs.git import GitRemote + +from dfetch_hub.catalog.model import CatalogSource, FetchMetadata, GitRefs, PackageContent, Tag, VCSLocation +from dfetch_hub.catalog.sources import BaseManifest + +if TYPE_CHECKING: + from pathlib import Path + +logger = get_logger(__name__) + + +class CatalogDetail: # pylint: 
disable=too-many-instance-attributes + """Represents the per-project detail JSON file with rich metadata for a library. + + Each library has a detail file (e.g., "github/abseil/abseil-cpp.json") containing + the canonical URL, which sources provide it, available version tags, installation + README, and license information. This is displayed on the web interface and + used to generate dfetch.yaml snippets. + """ + + def __init__( # pylint: disable=too-many-arguments,disable=too-many-positional-arguments + self, + canonical_url: str = "", + location: VCSLocation | None = None, + catalog_sources: list[CatalogSource] | None = None, + manifests: list[Any] | None = None, + git_refs: GitRefs | None = None, + package_content: PackageContent | None = None, + urls: dict[str, str] | None = None, + fetch_metadata: FetchMetadata | None = None, + ) -> None: + """Initialize a CatalogDetail. + + Args: + canonical_url: Homepage URL. Defaults to empty string. + location: VCS location (org, repo, subpath). Defaults to None. + catalog_sources: List of sources. Defaults to empty list. + manifests: List of manifests. Defaults to empty list. + git_refs: Git tags and branches. Defaults to empty refs. + package_content: README and license text. Defaults to empty content. + urls: Additional URLs dict. Defaults to empty dict. + fetch_metadata: Fetch timestamp metadata. Defaults to empty metadata. 
+ """ + self.canonical_url = canonical_url + self.location = location or VCSLocation(host="", org="", repo="") + self.catalog_sources = catalog_sources or [] + self.manifests = manifests or [] + self.git_refs = git_refs or GitRefs() + self.package_content = package_content or PackageContent() + self.urls = urls or {} + self.fetch_metadata = fetch_metadata or FetchMetadata() + + @property + def fetched_at(self) -> str: + """ISO-formatted fetch timestamp.""" + return self.fetch_metadata.fetched_at + + @fetched_at.setter + def fetched_at(self, value: str) -> None: + """Set the fetch timestamp.""" + self.fetch_metadata.fetched_at = value + + @property + def org(self) -> str: + """Repository organization/owner.""" + return self.location.org + + @org.setter + def org(self, value: str) -> None: + """Set the repository organization/owner.""" + self.location.org = value + + @property + def repo(self) -> str: + """Repository name.""" + return self.location.repo + + @repo.setter + def repo(self, value: str) -> None: + """Set the repository name.""" + self.location.repo = value + + @property + def subfolder_path(self) -> str | None: + """Monorepo subfolder path.""" + return self.location.subpath + + @subfolder_path.setter + def subfolder_path(self, value: str | None) -> None: + """Set the monorepo subfolder path.""" + self.location.subpath = value + + @property + def tags(self) -> list[Tag]: + """List of version tags.""" + return self.git_refs.tags + + @tags.setter + def tags(self, value: list[Tag]) -> None: + """Set the list of version tags.""" + self.git_refs.tags = value + + @property + def branches(self) -> list[Tag]: + """List of branches.""" + return self.git_refs.branches + + @branches.setter + def branches(self, value: list[Tag]) -> None: + """Set the list of branches.""" + self.git_refs.branches = value + + @property + def readme(self) -> str: + """README content.""" + return self.package_content.readme + + @readme.setter + def readme(self, value: str) -> None: + 
"""Set the README content.""" + self.package_content.readme = value + + @property + def license_text(self) -> str | None: + """License text.""" + return self.package_content.license_text + + @license_text.setter + def license_text(self, value: str | None) -> None: + """Set the license text.""" + self.package_content.license_text = value + + def to_dict(self) -> dict[str, Any]: + """Return a dict representation of this CatalogDetail.""" + return { + "canonical_url": self.canonical_url, + "org": self.location.org, + "repo": self.location.repo, + "subfolder_path": self.location.subpath, + "catalog_sources": [s.to_dict() for s in self.catalog_sources], + "manifests": self.manifests, + "readme": self.package_content.readme, + "tags": self.git_refs.to_dict()["tags"], + "branches": self.git_refs.to_dict()["branches"], + "urls": self.urls, + "license_text": self.package_content.license_text, + "fetched_at": self.fetched_at, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> CatalogDetail: + """Create a CatalogDetail from a dict.""" + return cls( + canonical_url=data.get("canonical_url", ""), + location=VCSLocation( + host="", + org=data.get("org", ""), + repo=data.get("repo", ""), + subpath=data.get("subfolder_path"), + ), + catalog_sources=[CatalogSource.from_dict(s) for s in data.get("catalog_sources", [])], + manifests=list(data.get("manifests", [])), + git_refs=GitRefs.from_dict(data), + package_content=PackageContent( + readme=data.get("readme", ""), + license_text=data.get("license_text"), + ), + urls=dict(data.get("urls", {})), + fetch_metadata=FetchMetadata.from_dict(data), + ) + + @classmethod + def from_manifest( # pylint: disable=too-many-arguments,too-many-positional-arguments + cls, + manifest: BaseManifest, + org: str, + repo: str, + source_name: str, + label: str, + registry_path: str, + ) -> CatalogDetail: + """Create a CatalogDetail from a manifest.""" + readme_content = getattr(manifest, "readme_content", None) + detail = cls( + 
@classmethod
def load(cls, path: Path) -> CatalogDetail | None:
    """Load a CatalogDetail from the JSON file at *path*.

    Args:
        path: Location of the detail JSON file.

    Returns:
        The detail built by ``from_dict``, or ``None`` when the file does
        not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    # EAFP: open directly instead of checking exists() first, so a file that
    # disappears between the check and the open still yields None.
    # NotADirectoryError keeps parity with Path.exists(), which reports False
    # when a parent component is a regular file.
    try:
        with path.open(encoding="utf-8") as fh:
            data = json.load(fh)
    except (FileNotFoundError, NotADirectoryError):
        return None
    # Built outside the try block so errors raised by from_dict are never masked.
    return cls.from_dict(data)
"```\n\n" + "## Usage\n\n" + f"After running `dfetch update`, the library will be available at `ext/{local_name}/`.\n" + ) + + def update_tags(self, manifest: BaseManifest) -> None: + """Update tags, fetching from upstream if needed.""" + if not self.tags and manifest.homepage: + self.tags.extend(self.fetch_upstream_tags(manifest.homepage)) + + if manifest.version: + tag_names_normalised = {t.name.lstrip("v") for t in self.tags} + if manifest.version.lstrip("v") not in tag_names_normalised: + self.tags.insert( + 0, + Tag( + name=manifest.version, + is_tag=True, + commit_sha=None, + date=None, + ), + ) + + @staticmethod + def fetch_upstream_tags(url: str) -> list[Tag]: + """Return git tags from url using dfetch's GitRemote.""" + try: + info = GitRemote._ls_remote(url) # pyright: ignore[reportPrivateUsage] # pylint: disable=protected-access + except Exception as exc: # pylint: disable=broad-exception-caught + logger.warning("Could not list tags for %s: %s", url, exc) # pragma: no cover + return [] # pragma: no cover + + return [ + Tag( + name=ref.replace("refs/tags/", ""), + is_tag=True, + commit_sha=sha, + date=None, + ) + for ref, sha in info.items() + if ref.startswith("refs/tags/") + ] + + def add_source( # pylint: disable=too-many-arguments,too-many-positional-arguments + self, + manifest: BaseManifest, + source_name: str, + label: str, + registry_path: str, + ) -> None: + """Add or update a catalog source entry.""" + new_source = CatalogSource( + source_name=source_name, + label=label, + index_path=f"{registry_path}/{manifest.entry_name}", + registry_version=manifest.version, + ) + new_index_path = new_source.index_path + + # Purge stale entries with same index_path but different source_name + self.catalog_sources = [ + s for s in self.catalog_sources if not (s.index_path == new_index_path and s.source_name != source_name) + ] + + # Update existing or append new + for s in self.catalog_sources: + if s.source_name == source_name: + s.index_path = 
def merge_from_manifest(self, manifest: BaseManifest, repo: str) -> None:
    """Fold README, URLs and subpath from *manifest* into this detail.

    Args:
        manifest: Manifest supplying the new data.
        repo: Repository name, used only when a README has to be generated.
    """
    fetched_readme = getattr(manifest, "readme_content", None)
    # A README shipped with the manifest always wins. A generated one is only
    # used when this detail has no README at all.
    if fetched_readme or not self.readme:
        self.readme = fetched_readme or self.generate_readme(manifest, repo, manifest.homepage or "")

    extra_urls = getattr(manifest, "urls", {})
    self.urls.update(extra_urls)

    component_path = manifest.subpath
    if component_path:
        self.subfolder_path = component_path
+ """ + + VCS_HOST_ALIASES: ClassVar[dict[str, str]] = { + "github.com": "github", + "gitlab.com": "gitlab", + "bitbucket.org": "bitbucket", + } + + def __init__( # pylint: disable=too-many-arguments,too-many-positional-arguments + self, + cat_id: str = "", + name: str = "", + description: str | None = None, + url: str = "", + source_type: str = "github", + default_branch: str = "main", + license_str: str | None = None, + topics: list[str] | None = None, + stars: int = 0, + last_updated: str = "", + source_labels: list[str] | None = None, + tags: list[Tag] | None = None, + ) -> None: + """Initialize a CatalogEntry. + + Args: + cat_id: Unique catalog identifier. + name: Package name. + description: Brief description. Defaults to None. + url: Homepage URL. Defaults to empty string. + source_type: VCS host type. Defaults to "github". + default_branch: Default branch name. Defaults to "main". + license_str: SPDX license identifier. Defaults to None. + topics: List of GitHub topics. Defaults to None. + stars: Star count. Defaults to 0. + last_updated: ISO-formatted update timestamp. Defaults to empty string. + source_labels: List of source names. Defaults to None. + tags: List of version tags. Defaults to None. 
+ """ + self.id = cat_id + self.name = name + self.description = description + self.url = url + self.source_type = source_type + self.default_branch = default_branch + self.license = license_str + self.topics = topics or [] + self.stars = stars + self.last_updated = last_updated + self.source_labels = source_labels or [] + self.tags = tags or [] + + @staticmethod + def vcs_host_label(host: str) -> str: + """Return a short, filesystem-safe label for a VCS hostname.""" + return CatalogEntry.VCS_HOST_ALIASES.get(host, host) + + @staticmethod + def catalog_id(vcs_host: str, org: str, repo: str, subpath: str | None = None) -> str: + """Return the catalog ID string for a package.""" + base = f"{vcs_host.lower()}/{org.lower()}/{repo.lower()}" + return f"{base}/{subpath.lower()}" if subpath else base + + @classmethod + def from_manifest( # pylint: disable=too-many-arguments,too-many-positional-arguments + cls, + manifest: BaseManifest, + vcs_host: str, + org: str, + repo: str, + label: str, + ) -> CatalogEntry: + """Create a CatalogEntry from a manifest.""" + subpath: str | None = manifest.subpath + entry = cls( + cat_id=VCSLocation(vcs_host, org, repo, subpath).catalog_id, + name=manifest.package_name, + description=manifest.description, + url=manifest.homepage or "", + source_type=vcs_host, + default_branch="main", + license_str=manifest.license, + topics=list(getattr(manifest, "topics", [])), + stars=0, + last_updated=datetime.now(UTC).isoformat(), + source_labels=[label], + tags=[], + ) + if manifest.version: + entry.update_tags(manifest.version) + return entry + + def merge_from_manifest(self, manifest: BaseManifest, is_update: bool, label: str) -> None: + """Merge data from a manifest into this entry.""" + self.merge_topics(is_update, list(getattr(manifest, "topics", []))) + + if manifest.description and not self.description: + self.description = manifest.description + if manifest.license and not self.license: + self.license = manifest.license + + if label not in 
def merge_topics(self, is_update: bool, topics: list[str]) -> None:
    """Append unseen *topics* to this entry, but only when updating.

    Args:
        is_update: ``True`` when an existing entry is being refreshed;
            nothing happens otherwise.
        topics: Candidate topics. Ones already present (including
            duplicates inside *topics* itself) are skipped.
    """
    if not (is_update and topics):
        return
    for topic in topics:
        if topic not in self.topics:
            self.topics.append(topic)
catalog.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class VCSLocation: + """Represents a VCS repository location. + + Attributes: + host: The VCS host label (e.g., "github", "gitlab"). + org: The organization/owner name. + repo: The repository name. + subpath: Optional subpath for monorepo components. + """ + + host: str + org: str + repo: str + subpath: str | None = None + + @property + def catalog_id(self) -> str: + """Return the catalog ID for this location.""" + base = f"{self.host.lower()}/{self.org.lower()}/{self.repo.lower()}" + return f"{base}/{self.subpath.lower()}" if self.subpath else base + + +@dataclass +class GitRefs: + """Represents git references (tags and branches) from a repository.""" + + tags: list[Tag] = field(default_factory=list) + branches: list[Tag] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + """Return a dict representation of this GitRefs.""" + return { + "tags": [t.to_dict() for t in self.tags], + "branches": [t.to_dict() for t in self.branches], + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> GitRefs: + """Create a GitRefs from a dict.""" + return cls( + tags=[Tag.from_dict(t) for t in data.get("tags", [])], + branches=[Tag.from_dict(t) for t in data.get("branches", [])], + ) + + +@dataclass +class PackageContent: + """Represents package content fetched from the repository.""" + + readme: str = "" + license_text: str | None = None + + +@dataclass +class FetchMetadata: + """Represents metadata about when the package was fetched.""" + + fetched_at: str = "" + + def to_dict(self) -> dict[str, Any]: + """Return a dict representation.""" + return {"fetched_at": self.fetched_at} + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> FetchMetadata: + """Create from dict.""" + return cls(fetched_at=data.get("fetched_at", "")) + + +@dataclass +class Tag: + """Represents a Git tag or branch reference from a 
@dataclass
class CatalogSource:
    """One package source listed in a project's detail JSON.

    A library can be offered by several package managers (vcpkg, Conan, clib,
    etc.). Each instance records one such source and where its registry entry
    lives.
    """

    source_name: str
    label: str
    index_path: str
    registry_version: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the four fields as a plain dict, in declaration order."""
        keys = ("source_name", "label", "index_path", "registry_version")
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> CatalogSource:
        """Build a CatalogSource from *data*, defaulting missing text fields to ``""``."""
        lookup = data.get
        return cls(
            lookup("source_name", ""),
            lookup("label", ""),
            lookup("index_path", ""),
            lookup("registry_version"),
        )
For GitLab (and similar hosts) - the *owner* component may contain slashes representing nested groups, e.g. - ``"group/subgroup"`` for ``https://gitlab.com/group/subgroup/repo``. - Lowercasing ensures the catalog ID, the detail-file path, and the JSON - fields are all consistent. - - Args: - url: A VCS repository URL. - - Returns: - A ``(host, owner, repo)`` triple, or ``None`` if *url* does not match - the expected ``https://host/…/repo`` pattern. - - """ - m = _VCS_URL_RE.match(url.strip()) - return (m.group(1).lower(), m.group(2).lower(), m.group(3).lower()) if m else None - - -# --------------------------------------------------------------------------- -# README fetching -# --------------------------------------------------------------------------- - -_HEADERS = {"User-Agent": "dfetch-hub/0.0.1"} -_README_NAMES = ("README.md", "readme.md", "Readme.md", "README.rst", "README") -RAW_BRANCHES = ("main", "master") - - -def raw_url(owner: str, repo: str, branch: str, filename: str) -> str: - """Build a raw.githubusercontent.com URL for a specific file. - - Args: - owner: Repository owner or organization. - repo: Repository name. - branch: Branch name to fetch from. - filename: Filename within the repository root. - - Returns: - Raw GitHub content - """ - return f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{filename}" - - -def fetch_raw(url: str) -> str | None: - """GET *url* and return the response body as a string, or ``None`` on failure.""" - try: - req = Request(url, headers=_HEADERS) - with urlopen(req, timeout=10) as resp: # nosec B310 - return str(resp.read().decode(errors="replace")) - except (URLError, OSError) as exc: - logger.debug("GET %s failed: %s", url, exc) - return None - - -def fetch_readme(owner: str, repo: str) -> str | None: - """Fetch the README from a GitHub repository. - - Tries ``main`` then ``master`` branch, and several common README filenames. - - Args: - owner: GitHub organisation or username. - repo: Repository name. 
- - Returns: - The raw README text on success, or ``None`` if nothing is found. - - """ - for branch in RAW_BRANCHES: - for name in _README_NAMES: - content = fetch_raw(raw_url(owner, repo, branch, name)) - if content is not None: - logger.debug("Fetched %s for %s/%s from %s", name, owner, repo, branch) - return content - return None - - -def fetch_readme_for_homepage(homepage: str | None) -> str | None: - """Fetch the README for a package given its homepage URL. - - Extracts the VCS host/owner/repo from *homepage* and delegates to - :func:`fetch_readme` when the host is ``github.com``. Returns ``None`` - for ``None`` input, non-VCS URLs, or non-GitHub hosts (README fetching - via raw content URLs is currently only implemented for GitHub). - - Args: - homepage: Upstream project URL (may be ``None`` or a non-GitHub URL). - - Returns: - The raw README text on success, or ``None``. - - """ - if not homepage: - return None - parsed = parse_vcs_slug(homepage) - if parsed is None or parsed[0] != "github.com": - return None - _, owner, repo = parsed - return fetch_readme(owner, repo) - - -# --------------------------------------------------------------------------- -# Base data model -# --------------------------------------------------------------------------- - - -@dataclass -class BaseManifest: - """Shared base fields for all catalog manifest dataclasses. - - Attributes: - entry_name: Unique identifier within the source registry. - package_name: Human-readable package name (may differ from entry_name). - description: Short description of the package. - homepage: Upstream project URL, or ``None`` if unknown. - license: SPDX license expression, or ``None`` if unspecified. - version: Latest version string, or ``None`` if unavailable. - readme_content: Raw README text fetched from the upstream repo, or ``None``. - urls: Named URLs for the package (e.g. ``{"Homepage": "...", - "Source": "..."``). Modelled on ``[project.urls]`` in - ``pyproject.toml``. 
Parsers populate this with every URL - they can discover; the catalog detail JSON exposes the full - dict so the frontend can render all links. - """ - - entry_name: str - package_name: str - description: str - homepage: str | None - license: str | None - version: str | None - readme_content: str | None = None - urls: dict[str, str] = field(default_factory=dict) +"""Format-specific source parsers (vcpkg, conan, clib).""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from urllib.error import URLError +from urllib.parse import urlparse +from urllib.request import Request, urlopen + +from dfetch.log import get_logger + +logger = get_logger(__name__) + + +# --------------------------------------------------------------------------- +# VCS URL helpers +# --------------------------------------------------------------------------- + +# Path segments that signal the end of the ``owner/repo`` portion of a VCS URL. +# For example ``/tree/``, ``/blob/``, ``/src/``, or GitLab's ``/-/`` separator. +_TRAILING_MARKERS: frozenset[str] = frozenset({"tree", "blob", "src", "-"}) + + +def parse_vcs_slug(url: str) -> tuple[str, str, str] | None: + """Return ``(host, owner, repo)`` extracted from a VCS URL, normalised to lowercase. + + Works with any ``https://host/owner/repo`` URL — GitHub, GitLab, Gitea, + Bitbucket, and company-hosted instances. For GitLab (and similar hosts) + the *owner* component may contain slashes representing nested groups, e.g. + ``"group/subgroup"`` for ``https://gitlab.com/group/subgroup/repo``. + Trailing path components that indicate sub-tree navigation (``/tree/``, + ``/blob/``, ``/src/``, ``/-/``) are stripped so that component URLs like + ``https://github.com/owner/repo/tree/main/src`` are correctly parsed. + Lowercasing ensures the catalog ID, the detail-file path, and the JSON + fields are all consistent. + + Args: + url: A VCS repository URL. 
+ + Returns: + A ``(host, owner, repo)`` triple, or ``None`` if *url* does not match + the expected ``https://host/…/repo`` pattern. + + """ + parsed = urlparse(url.strip()) + if parsed.scheme not in ("http", "https"): + return None + netloc = parsed.netloc.lower() + if not netloc: + return None + + segments = [s for s in parsed.path.split("/") if s] + if len(segments) < 2: + return None + + # Truncate at the first trailing marker (e.g. /tree/, /blob/, /-/). + # Only look from index 2 onwards so owner and repo are always present. + repo_end = len(segments) + for i in range(2, len(segments)): + if segments[i] in _TRAILING_MARKERS: + repo_end = i + break + + path_segments = segments[:repo_end] + repo = path_segments[-1].removesuffix(".git") + owner = "/".join(path_segments[:-1]) + return netloc, owner.lower(), repo.lower() + + +# --------------------------------------------------------------------------- +# README fetching +# --------------------------------------------------------------------------- + +_HEADERS = {"User-Agent": "dfetch-hub/0.0.1"} +_README_NAMES = ("README.md", "readme.md", "Readme.md", "README.rst", "README") +RAW_BRANCHES = ("main", "master") + + +def raw_url(owner: str, repo: str, branch: str, filename: str) -> str: + """Build a raw.githubusercontent.com URL for a specific file. + + Args: + owner: Repository owner or organization. + repo: Repository name. + branch: Branch name to fetch from. + filename: Filename within the repository root. 
def fetch_raw(url: str) -> str | None:
    """GET *url* and return the response body as a string, or ``None`` on failure.

    Only ``http`` and ``https`` URLs are fetched; any other scheme is skipped.
    Undecodable bytes in the response are replaced rather than raising.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The decoded response body, or ``None`` when the URL is invalid, uses an
        unsupported scheme, or the request fails.
    """
    # Imported locally so the block is self-contained. HTTPException is the
    # base of InvalidURL / IncompleteRead / BadStatusLine, none of which derive
    # from OSError, so the original except clause let them escape.
    from http.client import HTTPException  # pylint: disable=import-outside-toplevel

    try:
        scheme = urlparse(url).scheme
    except ValueError as exc:
        logger.debug("GET %s skipped: invalid URL (%s)", url, exc)
        return None
    if scheme not in ("http", "https"):
        logger.debug("GET %s skipped: unsupported scheme %r", url, scheme)
        return None
    try:
        req = Request(url, headers=_HEADERS)
        with urlopen(req, timeout=10) as resp:  # nosec B310
            return str(resp.read().decode(errors="replace"))
    except (URLError, OSError, HTTPException, ValueError) as exc:
        logger.debug("GET %s failed: %s", url, exc)
        return None
+ + """ + if not homepage: + return None + parsed = parse_vcs_slug(homepage) + if parsed is None or parsed[0] != "github.com": + return None + _, owner, repo = parsed + return fetch_readme(owner, repo) + + +# --------------------------------------------------------------------------- +# Base data model +# --------------------------------------------------------------------------- + + +@dataclass +class BaseManifest: # pylint: disable=too-many-instance-attributes + """Shared base fields for all catalog manifest dataclasses. + + Attributes: + entry_name: Unique identifier within the source registry. + package_name: Human-readable package name (may differ from entry_name). + description: Short description of the package. + homepage: Upstream project URL, or ``None`` if unknown. + license: SPDX license expression, or ``None`` if unspecified. + version: Latest version string, or ``None`` if unavailable. + readme_content: Raw README text fetched from the upstream repo, or ``None``. + urls: Named URLs for the package (e.g. ``{"Homepage": "...", + "Source": "..."``). Modelled on ``[project.urls]`` in + ``pyproject.toml``. Parsers populate this with every URL + they can discover; the catalog detail JSON exposes the full + dict so the frontend can render all links. + subpath: Subdirectory path within the source repository for this + component (e.g. ``"mylib"`` for a monorepo package at + ``repo/mylib``). ``None`` when the manifest represents + the repository root. Used to disambiguate catalog IDs and + detail-file paths for monorepos that contain multiple + components sharing the same repository URL. + in_project_repo: ``True`` when this manifest file resides within the same + repository as the project it describes (e.g. a README that + is part of the monorepo component it documents). ``False`` + (default) when the manifest is a registry entry that points + to an external project living in a separate repository (e.g. 
+ a ``vcpkg.json`` or ``conanfile.py`` in a central registry). + Only manifests with ``in_project_repo=True`` should have + their ``subpath`` derived from the containing directory name. + """ + + entry_name: str + package_name: str + description: str + homepage: str | None + license: str | None + version: str | None + readme_content: str | None = None + urls: dict[str, str] = field(default_factory=dict) + subpath: str | None = None + in_project_repo: bool = False + + @property + def sanitized_subpath(self) -> str | None: + """Return the sanitized subpath to prevent path traversal. + + Returns: + The sanitized subpath, or None if invalid. + """ + return self.sanitize_subpath(self.subpath) + + @staticmethod + def sanitize_subpath(subpath: str | None) -> str | None: + """Validate and sanitize a subpath to prevent path traversal. + + Args: + subpath: The subpath to validate. + + Returns: + The sanitized subpath, or None if invalid. + """ + if not subpath: + return None + + subpath = subpath.replace("\\", "/").strip("/") + + if not subpath or subpath.startswith("."): + return None + + if any(part == ".." 
for part in subpath.split("/")): + return None + + return subpath diff --git a/dfetch_hub/catalog/sources/clib.py b/dfetch_hub/catalog/sources/clib.py index 2e467c7..d0af09f 100644 --- a/dfetch_hub/catalog/sources/clib.py +++ b/dfetch_hub/catalog/sources/clib.py @@ -179,6 +179,7 @@ def _build_package( # pylint: disable=too-many-locals keywords=keywords, readme_content=fetch_readme(owner, repo) if is_github else None, urls=_build_urls(vcs_url, canonical_url), + in_project_repo=is_github, ) diff --git a/dfetch_hub/catalog/sources/conan.py b/dfetch_hub/catalog/sources/conan.py index 94b6a58..23dc34a 100644 --- a/dfetch_hub/catalog/sources/conan.py +++ b/dfetch_hub/catalog/sources/conan.py @@ -284,4 +284,5 @@ def parse_conan_recipe(recipe_dir: Path) -> ConanManifest | None: topics=topics, readme_content=fetch_readme_for_homepage(homepage), urls=_build_conan_urls(homepage, _extract_str_attr(text, "url")), + in_project_repo=False, ) diff --git a/dfetch_hub/catalog/sources/readme.py b/dfetch_hub/catalog/sources/readme.py index 012137c..f0c2afa 100644 --- a/dfetch_hub/catalog/sources/readme.py +++ b/dfetch_hub/catalog/sources/readme.py @@ -97,6 +97,7 @@ def parse_readme_dir(entry_dir: Path) -> BaseManifest | None: license=None, version=None, readme_content=text, + in_project_repo=True, ) logger.debug("No README found in %s — skipped", entry_dir) return None diff --git a/dfetch_hub/catalog/sources/vcpkg.py b/dfetch_hub/catalog/sources/vcpkg.py index c4f5cff..f96ebd7 100644 --- a/dfetch_hub/catalog/sources/vcpkg.py +++ b/dfetch_hub/catalog/sources/vcpkg.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +import re from dataclasses import dataclass, field from typing import TYPE_CHECKING @@ -15,6 +16,16 @@ logger = get_logger(__name__) +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +# Both naming conventions appear in the wild; check the 
standard name first. +_PORTFILE_NAMES = ("portfile.cmake", "port.cmake") + +# Matches vcpkg_from_github(... REPO owner/repo ...) across multiple lines. +_FROM_GITHUB_RE = re.compile(r"\bvcpkg_from_github\b.*?\bREPO\s+([\w.\-]+/[\w.\-]+)", re.DOTALL) + @dataclass class VcpkgManifest(BaseManifest): @@ -83,6 +94,32 @@ def _extract_dependencies(data: dict[str, object]) -> list[str]: return deps +def _github_url_from_portfile(entry_dir: Path) -> str | None: + """Extract the upstream GitHub repository URL from a vcpkg portfile. + + Looks for a ``portfile.cmake`` (or ``port.cmake``) in *entry_dir* and scans + it for a ``vcpkg_from_github(REPO owner/repo …)`` call. + + Args: + entry_dir: Port directory to scan (e.g. ``ports/abseil``). + + Returns: + A ``https://github.com/owner/repo`` URL, or ``None`` if the portfile + is absent, unreadable, or contains no ``vcpkg_from_github`` call. + """ + for name in _PORTFILE_NAMES: + portfile = entry_dir / name + if portfile.is_file(): + try: + text = portfile.read_text(encoding="utf-8", errors="replace") + except OSError: + return None + match = _FROM_GITHUB_RE.search(text) + if match: + return f"https://github.com/{match.group(1)}" + return None + + def parse_vcpkg_json(entry_dir: Path) -> VcpkgManifest | None: """Parse the ``vcpkg.json`` inside *entry_dir*. 
@@ -110,7 +147,7 @@ def parse_vcpkg_json(entry_dir: Path) -> VcpkgManifest | None: return None data: dict[str, object] = loaded - homepage = _extract_str_field(data, "homepage") + homepage = _extract_str_field(data, "homepage") or _github_url_from_portfile(entry_dir) license_val = _extract_str_field(data, "license") package_name = _extract_str_field(data, "name") or entry_dir.name @@ -128,4 +165,5 @@ def parse_vcpkg_json(entry_dir: Path) -> VcpkgManifest | None: dependencies=_extract_dependencies(data), readme_content=fetch_readme_for_homepage(homepage), urls=urls, + in_project_repo=False, ) diff --git a/dfetch_hub/catalog/writer.py b/dfetch_hub/catalog/writer.py index cae196e..ec31240 100644 --- a/dfetch_hub/catalog/writer.py +++ b/dfetch_hub/catalog/writer.py @@ -1,429 +1,246 @@ -"""Write parsed package manifests to catalog.json and per-project detail JSONs.""" - -from __future__ import annotations - -import json -from datetime import UTC, datetime -from typing import TYPE_CHECKING, Any - -from dfetch.log import get_logger -from dfetch.vcs.git import GitRemote - -from dfetch_hub.catalog.sources import BaseManifest, parse_vcs_slug - -if TYPE_CHECKING: - from pathlib import Path - -logger = get_logger(__name__) - - -# --------------------------------------------------------------------------- -# VCS host normalisation -# --------------------------------------------------------------------------- - -_VCS_HOST_ALIASES: dict[str, str] = { - "github.com": "github", - "gitlab.com": "gitlab", - "bitbucket.org": "bitbucket", -} - - -def _vcs_host_label(host: str) -> str: - """Return a short, filesystem-safe label for a VCS hostname. - - Well-known public hosts are mapped to their common short name - (``github``, ``gitlab``, ``bitbucket``). Unknown hosts (e.g. - self-hosted Gitea instances) are used verbatim. - - Args: - host: Lowercased hostname extracted from a VCS URL. - - Returns: - A short label string suitable for use in catalog IDs and directory names. 
- - """ - return _VCS_HOST_ALIASES.get(host, host) - - -def _catalog_id(vcs_host: str, org: str, repo: str) -> str: - return f"{vcs_host.lower()}/{org.lower()}/{repo.lower()}" - - -def _fetch_upstream_tags(url: str) -> list[dict[str, Any]]: - """Return git tags from *url* using dfetch's GitRemote.""" - try: - info = GitRemote._ls_remote(url) # pylint: disable=protected-access # pyright: ignore[reportPrivateUsage] - except Exception as exc: # pylint: disable=broad-exception-caught - logger.warning("Could not list tags for %s: %s", url, exc) # pragma: no cover - return [] # pragma: no cover - - return [ - { - "name": ref.replace("refs/tags/", ""), - "is_tag": True, - "commit_sha": sha, - "date": None, - } - for ref, sha in info.items() - if ref.startswith("refs/tags/") - ] - - -# --------------------------------------------------------------------------- -# catalog.json helpers -# --------------------------------------------------------------------------- - - -def _load_json(path: Path) -> Any: - if path.exists(): - with path.open(encoding="utf-8") as fh: - return json.load(fh) - return None - - -def _save_json(path: Path, data: Any) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w", encoding="utf-8") as fh: - json.dump(data, fh, indent=2, ensure_ascii=False) - fh.write("\n") - - -def _now_iso() -> str: - return datetime.now(UTC).isoformat() - - -# --------------------------------------------------------------------------- -# Catalog entry (catalog.json) -# --------------------------------------------------------------------------- - - -def _ensure_version_tag(tags: list[dict[str, Any]], version: str) -> None: - """Add *version* to *tags* if not already present. - - Normalises by stripping a leading ``"v"`` so ``"6.4.0"`` matches ``"v6.4.0"``. - - Args: - tags: Mutable tag list to update in-place. - version: Version string to ensure is present. 
- """ - tag_names_normalised = {str(t.get("name") or "").lstrip("v") for t in tags} - if version.lstrip("v") not in tag_names_normalised: - tags.insert( - 0, - { - "name": version, - "is_tag": True, - "commit_sha": None, - "date": None, - }, - ) - - -def _merge_topics( - entry: dict[str, Any], - existing: dict[str, Any] | None, - topics: list[str], -) -> None: - """Merge *topics* into ``entry["topics"]`` when updating an existing entry. - - No-op when *existing* is ``None`` (newly created entry already has the topics - embedded in the initial dict) or *topics* is empty. - - Args: - entry: Catalog entry dict to update in-place. - existing: Previous value of the entry, or ``None`` if newly created. - topics: Topics from the current manifest to add. - """ - if existing and topics: - existing_topics: list[str] = entry.setdefault("topics", []) - existing_topics.extend(t for t in topics if t not in existing_topics) - - -def _merge_catalog_entry( # pylint: disable=too-many-arguments,too-many-positional-arguments - existing: dict[str, Any] | None, - manifest: BaseManifest, - vcs_host: str, - org: str, - repo: str, - label: str, -) -> dict[str, Any]: - """Create or update a catalog.json entry for this package.""" - topics: list[str] = list(getattr(manifest, "topics", [])) - entry: dict[str, Any] = existing or { - "id": _catalog_id(vcs_host, org, repo), - "name": manifest.package_name, - "description": manifest.description, - "url": manifest.homepage or "", - "source_type": vcs_host, - "default_branch": "main", - "license": manifest.license, - "topics": topics, - "stars": 0, - "last_updated": _now_iso(), - "source_labels": [], - "tags": [], - } - - _merge_topics(entry, existing, topics) - - # Update fields that the manifest knows about and the catalog may be stale on - if manifest.description and not entry.get("description"): - entry["description"] = manifest.description - if manifest.license and not entry.get("license"): - entry["license"] = manifest.license - - # Ensure 
our label is in source_labels - labels: list[str] = entry.setdefault("source_labels", []) - if label not in labels: - labels.append(label) - - if manifest.version: - _ensure_version_tag(entry.setdefault("tags", []), manifest.version) - - return entry - - -# --------------------------------------------------------------------------- -# Detail JSON (data///.json) -# --------------------------------------------------------------------------- - - -def _catalog_source_entry( - manifest: BaseManifest, - source_name: str, - label: str, - registry_path: str, -) -> dict[str, Any]: - return { - "source_name": source_name, - "label": label, - "index_path": f"{registry_path}/{manifest.entry_name}", - "registry_version": manifest.version, - } - - -def _merge_catalog_sources( - detail: dict[str, Any], - manifest: BaseManifest, - source_name: str, - label: str, - registry_path: str, -) -> None: - """Update the ``catalog_sources`` list in *detail* for this manifest. - - Purges stale entries that share the same ``index_path`` but carry an - outdated ``source_name`` (e.g. after a source rename in ``dfetch-hub.toml``), - then upserts the current source entry. - - Args: - detail: Per-project detail dict to update in-place. - manifest: Package manifest supplying entry metadata. - source_name: Internal name of the catalog source. - label: Human-readable label for the source. - registry_path: Sub-path used to build the ``index_path``. 
- """ - sources: list[dict[str, Any]] = detail.setdefault("catalog_sources", []) - new_source = _catalog_source_entry(manifest, source_name, label, registry_path) - new_index_path = new_source["index_path"] - detail["catalog_sources"] = sources = [ - s for s in sources if not (s.get("index_path") == new_index_path and s.get("source_name") != source_name) - ] - existing_source = next((s for s in sources if s.get("source_name") == source_name), None) - if existing_source is None: - sources.append(new_source) - else: - existing_source.update(new_source) - - -def _merge_detail( # pylint: disable=too-many-arguments,too-many-positional-arguments - existing: dict[str, Any] | None, - manifest: BaseManifest, - org: str, - repo: str, - source_name: str, - label: str, - registry_path: str, -) -> dict[str, Any]: - """Create or update a per-project detail JSON.""" - fetched_readme: str | None = getattr(manifest, "readme_content", None) - detail: dict[str, Any] = existing or { - "canonical_url": manifest.homepage or "", - "org": org, - "repo": repo, - "subfolder_path": None, - "catalog_sources": [], - "manifests": [], - "readme": fetched_readme or _generate_readme(manifest, repo, manifest.homepage or ""), - "tags": [], - "branches": [ - {"name": "main", "is_tag": False, "commit_sha": None, "date": None}, - ], - "urls": {}, - "license_text": None, - "fetched_at": _now_iso(), - } - - # When we have a real upstream README, always overwrite the placeholder - if fetched_readme: - detail["readme"] = fetched_readme - - # Merge named URLs from this manifest into the detail's url map - detail.setdefault("urls", {}).update(getattr(manifest, "urls", {})) - - _merge_catalog_sources(detail, manifest, source_name, label, registry_path) - - # Populate tags from the upstream repo when the list is empty - tags: list[dict[str, Any]] = detail.setdefault("tags", []) - if not tags and manifest.homepage: - tags.extend(_fetch_upstream_tags(manifest.homepage)) - - if manifest.version: - 
_ensure_version_tag(tags, manifest.version) - - return detail - - -def _generate_readme(manifest: BaseManifest, repo: str, url: str) -> str: - """Generate a minimal installation README for a package. - - Args: - manifest: Package metadata supplying name, description, and version. - repo: Repository name used as the local checkout directory name. - url: Full VCS URL to embed in the dfetch.yaml snippet. - - Returns: - A Markdown string with a package heading, description, and dfetch - installation snippet. - - """ - version_line = f"\n tag: {manifest.version}" if manifest.version else "" - return ( - f"# {manifest.package_name}\n\n" - f"{manifest.description}\n\n" - "## Installation\n\n" - "Add to your `dfetch.yaml`:\n\n" - "```yaml\n" - "projects:\n" - f" - name: ext/{repo}\n" - f" url: {url}{version_line}\n" - "```\n\n" - "## Usage\n\n" - f"After running `dfetch update`, the library will be available at `ext/{repo}/`.\n" - ) - - -# --------------------------------------------------------------------------- -# Public API -# --------------------------------------------------------------------------- - - -def _write_detail_json( # pylint: disable=too-many-arguments,too-many-positional-arguments - data_dir: Path, - vcs_host: str, - org: str, - repo: str, - manifest: BaseManifest, - source_name: str, - label: str, - registry_path: str, -) -> None: - """Write or update the per-project detail JSON for one package. - - Args: - data_dir: Root of the catalog data directory. - vcs_host: Short VCS host label (e.g. ``"github"``). - org: Repository owner / organisation (lowercased). - repo: Repository name (lowercased). - manifest: Package manifest supplying metadata. - source_name: Internal name of the catalog source. - label: Human-readable label for the source. - registry_path: Sub-path used to build the ``index_path``. 
- """ - detail_path = data_dir / vcs_host / org / f"{repo}.json" - _save_json( - detail_path, - _merge_detail( - _load_json(detail_path), - manifest, - org, - repo, - source_name, - label, - registry_path, - ), - ) - - -def _process_manifest( # pylint: disable=too-many-arguments,too-many-positional-arguments - manifest: BaseManifest, - catalog: dict[str, Any], - data_dir: Path, - source_name: str, - label: str, - registry_path: str, -) -> tuple[bool, bool]: - """Update *catalog* and write the detail JSON for one manifest. - - Args: - manifest: Package manifest to process. - catalog: Mutable catalog dict to update in-place. - data_dir: Root of the catalog data directory. - source_name: Internal name of the catalog source. - label: Human-readable source label. - registry_path: Sub-path used to build the ``index_path``. - - Returns: - ``(added, updated)`` booleans — exactly one is ``True`` when the manifest - was successfully written; both are ``False`` when the manifest is skipped. - """ - if not manifest.homepage: - logger.warning("cannot determine upstream repo without a URL of %s", manifest.entry_name) - return False, False - - parsed = parse_vcs_slug(manifest.homepage) - if not parsed: - logger.warning("skipping entry without recognized VCS URL: %s", manifest.homepage) - return False, False - - host, org, repo = parsed - vcs_host = _vcs_host_label(host) - existing_entry = catalog.get(_catalog_id(vcs_host, org, repo)) - catalog[_catalog_id(vcs_host, org, repo)] = _merge_catalog_entry( - existing_entry, manifest, vcs_host, org, repo, label - ) - _write_detail_json(data_dir, vcs_host, org, repo, manifest, source_name, label, registry_path) - return existing_entry is None, existing_entry is not None - - -def write_catalog( - manifests: list[BaseManifest], - data_dir: Path, - source_name: str, - label: str, - registry_path: str, -) -> tuple[int, int]: - """Write *manifests* into catalog.json and per-project detail JSONs under *data_dir*. 
- - Args: - manifests: Parsed package manifests from any source strategy. - data_dir: Root of the catalog data directory. - source_name: Internal name of the source (e.g. ``"vcpkg"``). - label: Human-readable label added to each entry's ``source_labels``. - registry_path: Sub-path used to build the ``index_path`` in the detail JSON. - - Returns: - A ``(added, updated)`` tuple with the count of new and existing entries written. - - """ - catalog_path = data_dir / "catalog.json" - catalog: dict[str, Any] = _load_json(catalog_path) or {} - added = 0 - updated = 0 - - for manifest in manifests: - was_added, was_updated = _process_manifest(manifest, catalog, data_dir, source_name, label, registry_path) - added += was_added - updated += was_updated - - _save_json(catalog_path, catalog) - return added, updated +"""Catalog writer: builds the dfetch-hub package catalog from multiple sources. + +The catalog is the central index that dfetch-hub builds up from package sources +like vcpkg, Conan, and clib. Each entry represents a library that developers can +vendor into their projects. + +Developers browse the catalog to find appropriate packages, then add them to their +project's ``dfetch.yaml`` manifest. dfetch then clones the specified version +(typically a git tag) into the project's ``ext/`` directory. + +The writer produces two artifacts: +- ``catalog.json``: The main index mapping catalog IDs to library entries. +- ``//.json``: Per-project detail files with rich metadata including + available versions, installation instructions, and which sources provide the package. 
+""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Any + +from dfetch.log import get_logger + +from dfetch_hub.catalog.detail import CatalogDetail +from dfetch_hub.catalog.entry import CatalogEntry +from dfetch_hub.catalog.sources import BaseManifest, parse_vcs_slug + +if TYPE_CHECKING: + from pathlib import Path + +logger = get_logger(__name__) + + +# --------------------------------------------------------------------------- +# Catalog +# --------------------------------------------------------------------------- + + +class Catalog: + """Represents the catalog.json index of all available libraries. + + This is the main index developers browse to find packages. Each key is a catalog + ID like "github/abseil/abseil-cpp" and the value is a CatalogEntry with metadata. + """ + + def __init__(self, entries: dict[str, CatalogEntry] | None = None) -> None: + """Initialize a Catalog. + + Args: + entries: Dictionary mapping catalog IDs to entries. Defaults to empty dict. + """ + self.entries = entries or {} + + def to_dict(self) -> dict[str, Any]: + """Return a dict representation of this Catalog.""" + return {k: v.to_dict() for k, v in self.entries.items()} + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> Catalog: + """Create a Catalog from a dict. + + Args: + data: Dictionary representation of the catalog. + + Returns: + A new Catalog instance. + """ + return cls(entries={k: CatalogEntry.from_dict(v) for k, v in data.items()}) + + @classmethod + def load(cls, path: Path) -> Catalog: + """Load a Catalog from a JSON file, or return empty if it doesn't exist. + + Args: + path: Path to the catalog.json file. + + Returns: + A Catalog instance, or an empty catalog if the file doesn't exist. + """ + if not path.exists(): + return cls() + with path.open(encoding="utf-8") as fh: + return cls.from_dict(json.load(fh)) + + def dump(self, path: Path) -> None: + """Save this Catalog to a JSON file. 
+ + Args: + path: Path to write the catalog.json file. + + Raises: + OSError: If the file cannot be written. + """ + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as fh: + json.dump(self.to_dict(), fh, indent=2, ensure_ascii=False) + fh.write("\n") + + def get_or_create_entry( # pylint: disable=too-many-arguments,too-many-positional-arguments + self, + manifest: BaseManifest, + vcs_host: str, + org: str, + repo: str, + label: str, + ) -> tuple[CatalogEntry, bool]: + """Get or create a catalog entry for this manifest. + + Args: + manifest: The package manifest containing metadata. + vcs_host: The VCS host (e.g., "github"). + org: The organization/owner. + repo: The repository name. + label: The source label. + + Returns: + A tuple of (entry, is_new) where is_new is True if entry was newly created. + """ + cat_id = CatalogEntry.catalog_id(vcs_host, org, repo, manifest.subpath) + existing = self.entries.get(cat_id) + if existing: + existing.merge_from_manifest(manifest, is_update=True, label=label) + return existing, False + + entry = CatalogEntry.from_manifest(manifest, vcs_host, org, repo, label) + self.entries[cat_id] = entry + return entry, True + + def remove_entry(self, vcs_host: str, org: str, repo: str) -> bool: + """Remove a catalog entry for a repo-root (no subpath). + + This is used when migrating from a repo-root entry to a subpath entry + for monorepo packages. + + Args: + vcs_host: The VCS host (e.g., "github"). + org: The organization/owner. + repo: The repository name. + + Returns: + True if an entry was removed, False otherwise. 
+ """ + root_id = CatalogEntry.catalog_id(vcs_host, org, repo, None) + if root_id in self.entries: + del self.entries[root_id] + return True + return False + + +# --------------------------------------------------------------------------- +# CatalogWriter +# --------------------------------------------------------------------------- + + +class CatalogWriter: + """Handles writing package manifests to the catalog and detail JSON files. + + This is the main entry point for updating the catalog. Given a list of manifests + from a package source (vcpkg, Conan, clib, etc.), it updates both the catalog + index and individual detail files. + """ + + def __init__( + self, + data_dir: Path, + source_name: str, + label: str, + registry_path: str, + ) -> None: + """Initialize a CatalogWriter. + + Args: + data_dir: Root directory for catalog data. + source_name: Name of the package source. + label: Human-readable label for the source. + registry_path: Path within the source registry. + """ + self.data_dir = data_dir + self.source_name = source_name + self.label = label + self.registry_path = registry_path + + def write(self, manifests: list[BaseManifest]) -> tuple[int, int]: + """Write all manifests to the catalog.""" + catalog = Catalog.load(self.data_dir / "catalog.json") + added = 0 + updated = 0 + + for manifest in manifests: + was_added, was_updated = self.write_manifest(manifest, catalog) + added += was_added + updated += was_updated + + catalog.dump(self.data_dir / "catalog.json") + return added, updated + + def write_manifest(self, manifest: BaseManifest, catalog: Catalog) -> tuple[bool, bool]: + """Write a single manifest to catalog and detail files.""" + if not manifest.homepage: + logger.warning("cannot determine upstream repo without a URL of %s", manifest.entry_name) + return False, False + + parsed = parse_vcs_slug(manifest.homepage) + if not parsed: + logger.warning("skipping entry without recognized VCS URL: %s", manifest.homepage) + return False, False + + 
vcs_host, org, repo = parsed + vcs_host = CatalogEntry.vcs_host_label(vcs_host) + + sanitized = manifest.sanitized_subpath + if sanitized: + root_id = CatalogEntry.catalog_id(vcs_host, org, repo, None) + existing_root = catalog.entries.get(root_id) + if existing_root and self.label in existing_root.source_labels: + catalog.remove_entry(vcs_host, org, repo) + + _, is_new = catalog.get_or_create_entry(manifest, vcs_host, org, repo, self.label) + + self._write_detail(vcs_host, org, repo, manifest) + + return is_new, not is_new + + def _write_detail( + self, + vcs_host: str, + org: str, + repo: str, + manifest: BaseManifest, + ) -> None: + """Write the detail JSON for a manifest.""" + subpath = manifest.sanitized_subpath + + if subpath: + detail_path = self.data_dir / vcs_host / org / repo / f"{subpath}.json" + else: + detail_path = self.data_dir / vcs_host / org / f"{repo}.json" + + existing = CatalogDetail.load(detail_path) + if existing: + detail = existing + else: + detail = CatalogDetail.from_manifest(manifest, org, repo, self.source_name, self.label, self.registry_path) + + detail.update_from_manifest(manifest, repo, self.source_name, self.label, self.registry_path) + detail.dump(self.data_dir, vcs_host, org, repo, subpath) diff --git a/dfetch_hub/commands/update.py b/dfetch_hub/commands/update.py index fab6f94..cca31a5 100644 --- a/dfetch_hub/commands/update.py +++ b/dfetch_hub/commands/update.py @@ -1,286 +1,291 @@ -"""dfetch-hub ``update`` subcommand.""" - -from __future__ import annotations - -import argparse -import importlib.resources -import sys -import tempfile -from pathlib import Path -from typing import TYPE_CHECKING - -from dfetch.log import get_logger - -from dfetch_hub.catalog.cloner import clone_source -from dfetch_hub.catalog.sources.clib import CLibPackage, parse_packages_md -from dfetch_hub.catalog.sources.conan import parse_conan_recipe -from dfetch_hub.catalog.sources.readme import parse_readme_dir -from dfetch_hub.catalog.sources.vcpkg import 
parse_vcpkg_json -from dfetch_hub.catalog.writer import write_catalog -from dfetch_hub.commands import load_config_with_data_dir - -if TYPE_CHECKING: - from collections.abc import Callable - - from dfetch_hub.catalog.sources import BaseManifest - from dfetch_hub.config import SourceConfig - -logger = get_logger(__name__) - -_DEFAULT_DATA_DIR: Path = Path(str(importlib.resources.files("dfetch_hub") / "data")) - -_MANIFEST_PARSERS = { - "vcpkg.json": parse_vcpkg_json, - "conandata.yml": parse_conan_recipe, - "readme": parse_readme_dir, -} - - -def _filter_sentinel(source: SourceConfig, entry_dirs: list[Path]) -> list[Path]: - """Remove entries from *entry_dirs* that contain ``source.ignore_if_present``. - - When ``source.ignore_if_present`` is an empty string the original list is - returned unchanged. Otherwise every directory that contains a file (or - sub-directory) with that name is removed, and the number of removals is - logged at info level. - - Args: - source: Source configuration (provides ``ignore_if_present`` and ``name``). - entry_dirs: Candidate entry directories to filter. - - Returns: - Filtered list with sentinel-containing directories removed. - - """ - if not source.ignore_if_present: - return entry_dirs - before = len(entry_dirs) - filtered = [d for d in entry_dirs if not (d / source.ignore_if_present).exists()] - ignored = before - len(filtered) - if ignored: - logger.print_info_line( - source.name, - f"Ignored {ignored} folder(s) containing '{source.ignore_if_present}'", - ) - return filtered - - -def _subfolder_homepage(source: SourceConfig) -> str | None: - """Return the repository URL as a homepage for a subfolder package. - - VCS hosting providers (GitHub, GitLab, Gitea, Bitbucket, company-hosted - instances) each use different URL schemes for linking to subdirectories, so - we do not attempt to construct a provider-specific tree URL. The repository - root URL is always a valid and useful landing page. 
- - Args: - source: Source configuration supplying the remote URL. - - Returns: - ``source.url`` if non-empty, ``None`` otherwise. - - """ - return source.url or None - - -def _parse_entry_dirs( - entry_dirs: list[Path], - parse_fn: Callable[[Path], BaseManifest | None], - fallback_homepage: str | None, -) -> tuple[list[BaseManifest], int]: - """Parse *entry_dirs* and return ``(manifests, skipped_count)``. - - For each directory the appropriate parser is called. Packages whose - homepage is ``None`` get the *fallback_homepage* (i.e. the repository root - URL) if one is available. - - Args: - entry_dirs: Sorted list of package directories to process. - parse_fn: Parser function for the manifest type. - fallback_homepage: Repository root URL used when a manifest has no homepage. - - Returns: - A ``(manifests, skipped)`` tuple. - """ - manifests: list[BaseManifest] = [] - skipped = 0 - for entry_dir in entry_dirs: - m = parse_fn(entry_dir) - if m is None: - skipped += 1 - else: - if m.homepage is None and fallback_homepage is not None: - m.homepage = fallback_homepage - manifests.append(m) - return manifests, skipped - - -def _process_subfolders_source( - source: SourceConfig, - data_dir: Path, - limit: int | None, -) -> None: - """Handle strategy='subfolders' (vcpkg, conan-center, …). - - Dispatches to the appropriate per-directory parser based on - ``source.manifest`` (e.g. ``vcpkg.json`` → vcpkg, ``conandata.yml`` → conan). 
- """ - parse_fn = _MANIFEST_PARSERS.get(source.manifest) - if parse_fn is None: - if not source.manifest: - logger.warning("%s: no 'manifest' configured — skipped", source.name) - else: - logger.warning( - "%s: manifest type '%s' not supported — skipped", - source.name, - source.manifest, - ) - return - - logger.print_info_line(source.name, f"Fetching {source.url} (src: {source.path}) ...") - with tempfile.TemporaryDirectory(prefix="dfetch-hub-") as tmp: - fetched_dir = clone_source(source, Path(tmp)) - - entry_dirs = sorted(d for d in fetched_dir.iterdir() if d.is_dir()) - entry_dirs = _filter_sentinel(source, entry_dirs) - if limit is not None: - entry_dirs = entry_dirs[:limit] - - logger.print_info_line(source.name, f"Parsing {len(entry_dirs)} package(s) ...") - manifests, skipped = _parse_entry_dirs(entry_dirs, parse_fn, _subfolder_homepage(source)) - - if skipped: - logger.print_warning_line( - source.name, - f"Skipped {skipped} package(s) with no manifest", - ) - - _added, _updated = write_catalog( - manifests, - data_dir, - source_name=source.name, - label=source.label or source.name, - registry_path=source.path or source.name, - ) - logger.print_info_line( - source.name, - f"Done — {_added} added, {_updated} updated ({len(manifests) - _added - _updated} skipped/no-vcs-url)", - ) - - -def _process_git_wiki_source( - source: SourceConfig, - data_dir: Path, - limit: int | None, -) -> None: - """Handle strategy='git-wiki': clone a git wiki repo and parse a markdown index. - - The wiki is fetched via dfetch (shallow clone), then the file named by - ``source.manifest`` (e.g. ``Packages.md``) is parsed to discover packages. - For each package the upstream ``package.json`` is fetched from GitHub to - collect richer metadata. 
- """ - if not source.manifest: - logger.print_warning_line(source.name, "no 'manifest' configured — skipped") - return - - logger.print_info_line(source.name, f"Fetching wiki {source.url} ...") - with tempfile.TemporaryDirectory(prefix="dfetch-hub-") as tmp: - tmp_path = Path(tmp) - fetched_dir = clone_source(source, tmp_path) - - index_file = fetched_dir / source.manifest - if not index_file.exists(): - logger.print_warning_line( - source.name, - f"'{source.manifest}' not found in fetched wiki — skipped", - ) - return - - logger.print_info_line(source.name, f"Parsing {source.manifest} ...") - packages: list[CLibPackage] = parse_packages_md(index_file, limit=limit) - - logger.print_info_line(source.name, f"Fetched metadata for {len(packages)} package(s)") - _added, _updated = write_catalog( - packages, # type: ignore[arg-type] - data_dir, - source_name=source.name, - label=source.label or source.name, - registry_path=source.path or source.name, - ) - logger.print_info_line( - source.name, - f"Done — {_added} added, {_updated} updated ({len(packages) - _added - _updated} skipped/no-vcs-url)", - ) - - -def _process_source( - source: SourceConfig, - data_dir: Path, - limit: int | None, -) -> None: - if source.strategy == "subfolders": - _process_subfolders_source(source, data_dir, limit) - elif source.strategy == "git-wiki": - _process_git_wiki_source(source, data_dir, limit) - else: - logger.warning( - "%s: strategy '%s' not yet supported — skipped", - source.name, - source.strategy, - ) - - -def _cmd_update(parsed: argparse.Namespace) -> None: - """Run the catalog update pipeline.""" - config, data_dir = load_config_with_data_dir(parsed.config, parsed.data_dir, _DEFAULT_DATA_DIR) - - sources = config.sources - if parsed.source: - sources = [s for s in sources if s.name == parsed.source] - if not sources: - logger.warning("No source found with name '%s'", parsed.source) - sys.exit(1) - - for source in sources: - _process_source(source, data_dir, parsed.limit) - - 
-def _non_negative_int(value: str) -> int: - """Parse *value* as a non-negative integer for ``--limit``.""" - parsed = int(value) - if parsed < 0: - raise argparse.ArgumentTypeError("--limit must be >= 0") - return parsed - - -def register(subparsers: argparse._SubParsersAction) -> None: # type: ignore[type-arg] - """Register the ``update`` subcommand onto *subparsers*.""" - update_p = subparsers.add_parser( - "update", - help="Fetch sources from dfetch-hub.toml and update the catalog.", - ) - update_p.add_argument( - "--config", - default="dfetch-hub.toml", - help="Path to dfetch-hub.toml (default: %(default)s)", - ) - update_p.add_argument( - "--data-dir", - default=None, - help=f"Catalog data directory (default: catalog_path from config, else {_DEFAULT_DATA_DIR})", - ) - update_p.add_argument( - "--limit", - type=_non_negative_int, - default=None, - metavar="N", - help="Process only the first N entries per source (useful for testing)", - ) - update_p.add_argument( - "--source", - default=None, - metavar="NAME", - help="Only process the source with this name", - ) - update_p.set_defaults(func=_cmd_update) +"""dfetch-hub ``update`` subcommand.""" + +from __future__ import annotations + +import argparse +import importlib.resources +import sys +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING + +from dfetch.log import get_logger + +from dfetch_hub.catalog.cloner import clone_source +from dfetch_hub.catalog.sources.clib import CLibPackage, parse_packages_md +from dfetch_hub.catalog.sources.conan import parse_conan_recipe +from dfetch_hub.catalog.sources.readme import parse_readme_dir +from dfetch_hub.catalog.sources.vcpkg import parse_vcpkg_json +from dfetch_hub.catalog.writer import CatalogWriter +from dfetch_hub.commands import load_config_with_data_dir + +if TYPE_CHECKING: + from collections.abc import Callable + + from dfetch_hub.catalog.sources import BaseManifest + from dfetch_hub.config import SourceConfig + +logger = 
get_logger(__name__) + +_DEFAULT_DATA_DIR: Path = Path(str(importlib.resources.files("dfetch_hub") / "data")) + +_MANIFEST_PARSERS = { + "vcpkg.json": parse_vcpkg_json, + "conandata.yml": parse_conan_recipe, + "readme": parse_readme_dir, +} + + +def _filter_sentinel(source: SourceConfig, entry_dirs: list[Path]) -> list[Path]: + """Remove entries from *entry_dirs* that contain ``source.ignore_if_present``. + + When ``source.ignore_if_present`` is an empty string the original list is + returned unchanged. Otherwise every directory that contains a file (or + sub-directory) with that name is removed, and the number of removals is + logged at info level. + + Args: + source: Source configuration (provides ``ignore_if_present`` and ``name``). + entry_dirs: Candidate entry directories to filter. + + Returns: + Filtered list with sentinel-containing directories removed. + + """ + if not source.ignore_if_present: + return entry_dirs + before = len(entry_dirs) + filtered = [d for d in entry_dirs if not (d / source.ignore_if_present).exists()] + ignored = before - len(filtered) + if ignored: + logger.print_info_line( + source.name, + f"Ignored {ignored} folder(s) containing '{source.ignore_if_present}'", + ) + return filtered + + +def _subfolder_homepage(source: SourceConfig) -> str | None: + """Return the repository URL as a homepage for a subfolder package. + + VCS hosting providers (GitHub, GitLab, Gitea, Bitbucket, company-hosted + instances) each use different URL schemes for linking to subdirectories, so + we do not attempt to construct a provider-specific tree URL. The repository + root URL is always a valid and useful landing page. + + Args: + source: Source configuration supplying the remote URL. + + Returns: + ``source.url`` if non-empty, ``None`` otherwise. 
+ + """ + return source.url or None + + +def _parse_entry_dirs( + entry_dirs: list[Path], + parse_fn: Callable[[Path], BaseManifest | None], + fallback_homepage: str | None, +) -> tuple[list[BaseManifest], int]: + """Parse *entry_dirs* and return ``(manifests, skipped_count)``. + + For each directory the appropriate parser is called. Packages whose + homepage is ``None`` get the *fallback_homepage* (i.e. the repository root + URL) if one is available. + + Args: + entry_dirs: Sorted list of package directories to process. + parse_fn: Parser function for the manifest type. + fallback_homepage: Repository root URL used when a manifest has no homepage. + + Returns: + A ``(manifests, skipped)`` tuple. + """ + manifests: list[BaseManifest] = [] + skipped = 0 + for entry_dir in entry_dirs: + m = parse_fn(entry_dir) + if m is None: + skipped += 1 + else: + if m.homepage is None and fallback_homepage is not None: + m.homepage = fallback_homepage + if m.in_project_repo and not m.subpath: + m.subpath = entry_dir.name + manifests.append(m) + return manifests, skipped + + +def _process_subfolders_source( + source: SourceConfig, + data_dir: Path, + limit: int | None, +) -> None: + """Handle strategy='subfolders' (vcpkg, conan-center, …). + + Dispatches to the appropriate per-directory parser based on + ``source.manifest`` (e.g. ``vcpkg.json`` → vcpkg, ``conandata.yml`` → conan). 
+ """ + parse_fn = _MANIFEST_PARSERS.get(source.manifest) + if parse_fn is None: + if not source.manifest: + logger.warning("%s: no 'manifest' configured — skipped", source.name) + else: + logger.warning( + "%s: manifest type '%s' not supported — skipped", + source.name, + source.manifest, + ) + return + + logger.print_info_line(source.name, f"Fetching {source.url} (src: {source.path}) ...") + with tempfile.TemporaryDirectory(prefix="dfetch-hub-") as tmp: + fetched_dir = clone_source(source, Path(tmp)) + + entry_dirs = sorted(d for d in fetched_dir.iterdir() if d.is_dir()) + entry_dirs = _filter_sentinel(source, entry_dirs) + if limit is not None: + entry_dirs = entry_dirs[:limit] + + logger.print_info_line(source.name, f"Parsing {len(entry_dirs)} package(s) ...") + manifests, skipped = _parse_entry_dirs(entry_dirs, parse_fn, _subfolder_homepage(source)) + + if skipped: + logger.print_warning_line( + source.name, + f"Skipped {skipped} package(s) with no manifest", + ) + + writer = CatalogWriter( + data_dir, + source.name, + source.label or source.name, + source.path or source.name, + ) + _added, _updated = writer.write(manifests) + logger.print_info_line( + source.name, + f"Done — {_added} added, {_updated} updated ({len(manifests) - _added - _updated} skipped/no-vcs-url)", + ) + + +def _process_git_wiki_source( + source: SourceConfig, + data_dir: Path, + limit: int | None, +) -> None: + """Handle strategy='git-wiki': clone a git wiki repo and parse a markdown index. + + The wiki is fetched via dfetch (shallow clone), then the file named by + ``source.manifest`` (e.g. ``Packages.md``) is parsed to discover packages. + For each package the upstream ``package.json`` is fetched from GitHub to + collect richer metadata. 
+ """ + if not source.manifest: + logger.print_warning_line(source.name, "no 'manifest' configured — skipped") + return + + logger.print_info_line(source.name, f"Fetching wiki {source.url} ...") + with tempfile.TemporaryDirectory(prefix="dfetch-hub-") as tmp: + tmp_path = Path(tmp) + fetched_dir = clone_source(source, tmp_path) + + index_file = fetched_dir / source.manifest + if not index_file.exists(): + logger.print_warning_line( + source.name, + f"'{source.manifest}' not found in fetched wiki — skipped", + ) + return + + logger.print_info_line(source.name, f"Parsing {source.manifest} ...") + packages: list[CLibPackage] = parse_packages_md(index_file, limit=limit) + + logger.print_info_line(source.name, f"Fetched metadata for {len(packages)} package(s)") + writer = CatalogWriter( + data_dir, + source.name, + source.label or source.name, + source.path or source.name, + ) + _added, _updated = writer.write(packages) # type: ignore[arg-type] + logger.print_info_line( + source.name, + f"Done — {_added} added, {_updated} updated ({len(packages) - _added - _updated} skipped/no-vcs-url)", + ) + + +def _process_source( + source: SourceConfig, + data_dir: Path, + limit: int | None, +) -> None: + if source.strategy == "subfolders": + _process_subfolders_source(source, data_dir, limit) + elif source.strategy == "git-wiki": + _process_git_wiki_source(source, data_dir, limit) + else: + logger.warning( + "%s: strategy '%s' not yet supported — skipped", + source.name, + source.strategy, + ) + + +def _cmd_update(parsed: argparse.Namespace) -> None: + """Run the catalog update pipeline.""" + config, data_dir = load_config_with_data_dir(parsed.config, parsed.data_dir, _DEFAULT_DATA_DIR) + + sources = config.sources + if parsed.source: + sources = [s for s in sources if s.name == parsed.source] + if not sources: + logger.warning("No source found with name '%s'", parsed.source) + sys.exit(1) + + for source in sources: + _process_source(source, data_dir, parsed.limit) + + 
+_LIMIT_ERROR_MSG = "--limit must be >= 0" + + +def _non_negative_int(value: str) -> int: + """Parse *value* as a non-negative integer for ``--limit``.""" + parsed = int(value) + if parsed < 0: + raise argparse.ArgumentTypeError(_LIMIT_ERROR_MSG) + return parsed + + +def register(subparsers: argparse._SubParsersAction) -> None: # type: ignore[type-arg] + """Register the ``update`` subcommand onto *subparsers*.""" + update_p = subparsers.add_parser( + "update", + help="Fetch sources from dfetch-hub.toml and update the catalog.", + ) + update_p.add_argument( + "--config", + default="dfetch-hub.toml", + help="Path to dfetch-hub.toml (default: %(default)s)", + ) + update_p.add_argument( + "--data-dir", + default=None, + help=f"Catalog data directory (default: catalog_path from config, else {_DEFAULT_DATA_DIR})", + ) + update_p.add_argument( + "--limit", + type=_non_negative_int, + default=None, + metavar="N", + help="Process only the first N entries per source (useful for testing)", + ) + update_p.add_argument( + "--source", + default=None, + metavar="NAME", + help="Only process the source with this name", + ) + update_p.set_defaults(func=_cmd_update) diff --git a/dfetch_hub/data/catalog.json b/dfetch_hub/data/catalog.json index 62f88da..6f51c41 100644 --- a/dfetch_hub/data/catalog.json +++ b/dfetch_hub/data/catalog.json @@ -784,5 +784,60 @@ "date": null } ] + }, + "github/faburaya/3fd": { + "id": "github/faburaya/3fd", + "name": "3fd", + "description": "C++ Framework For Fast Development", + "url": "https://github.com/faburaya/3fd", + "source_type": "github", + "default_branch": "main", + "license": null, + "topics": [], + "stars": 0, + "last_updated": "2026-03-07T21:34:13.599616+00:00", + "source_labels": [ + "vcpkg" + ], + "tags": [ + { + "name": "2.6.3", + "is_tag": true, + "commit_sha": null, + "date": null + } + ] + }, + "github/bakeruk/modern-typescript-monorepo-example/typescript-example-1": { + "id": 
"github/bakeruk/modern-typescript-monorepo-example/typescript-example-1", + "name": "typescript-example-1", + "description": "The first typescript example for the Monorepo example", + "url": "https://github.com/bakeruk/modern-typescript-monorepo-example", + "source_type": "github", + "default_branch": "main", + "license": null, + "topics": [], + "stars": 0, + "last_updated": "2026-03-07T21:34:24.325431+00:00", + "source_labels": [ + "ts-monorepo" + ], + "tags": [] + }, + "github/bakeruk/modern-typescript-monorepo-example/typescript-example-2": { + "id": "github/bakeruk/modern-typescript-monorepo-example/typescript-example-2", + "name": "typescript-example-2", + "description": "The second typescript example for the Monorepo example", + "url": "https://github.com/bakeruk/modern-typescript-monorepo-example", + "source_type": "github", + "default_branch": "main", + "license": null, + "topics": [], + "stars": 0, + "last_updated": "2026-03-07T21:34:24.622203+00:00", + "source_labels": [ + "ts-monorepo" + ], + "tags": [] } } diff --git a/dfetch_hub/data/github/bakeruk/modern-typescript-monorepo-example/typescript-example-1.json b/dfetch_hub/data/github/bakeruk/modern-typescript-monorepo-example/typescript-example-1.json new file mode 100644 index 0000000..272260f --- /dev/null +++ b/dfetch_hub/data/github/bakeruk/modern-typescript-monorepo-example/typescript-example-1.json @@ -0,0 +1,28 @@ +{ + "canonical_url": "https://github.com/bakeruk/modern-typescript-monorepo-example", + "org": "bakeruk", + "repo": "modern-typescript-monorepo-example", + "subfolder_path": "typescript-example-1", + "catalog_sources": [ + { + "source_name": "ts-monorepo", + "label": "ts-monorepo", + "index_path": "packages/typescript-example-1", + "registry_version": null + } + ], + "manifests": [], + "readme": "# Typescript example #1\n\nThe first typescript example for the Monorepo example\n\n## License\n\nMIT License\n\nCopyright (c) 2023 Luke Baker\n\nPermission is hereby granted, free of charge, 
to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n", + "tags": [], + "branches": [ + { + "name": "main", + "is_tag": false, + "commit_sha": null, + "date": null + } + ], + "urls": {}, + "license_text": null, + "fetched_at": "2026-03-07T21:07:55.629467+00:00" +} diff --git a/dfetch_hub/data/github/bakeruk/modern-typescript-monorepo-example/typescript-example-2.json b/dfetch_hub/data/github/bakeruk/modern-typescript-monorepo-example/typescript-example-2.json new file mode 100644 index 0000000..fce72ee --- /dev/null +++ b/dfetch_hub/data/github/bakeruk/modern-typescript-monorepo-example/typescript-example-2.json @@ -0,0 +1,28 @@ +{ + "canonical_url": "https://github.com/bakeruk/modern-typescript-monorepo-example", + "org": "bakeruk", + "repo": "modern-typescript-monorepo-example", + "subfolder_path": "typescript-example-2", + "catalog_sources": [ + { + "source_name": "ts-monorepo", + "label": "ts-monorepo", + "index_path": "packages/typescript-example-2", + "registry_version": null + } + ], + "manifests": [], + "readme": "# Typescript 
example #2\n\nThe second typescript example for the Monorepo example\n\n## License\n\nMIT License\n\nCopyright (c) 2023 Luke Baker\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n", + "tags": [], + "branches": [ + { + "name": "main", + "is_tag": false, + "commit_sha": null, + "date": null + } + ], + "urls": {}, + "license_text": null, + "fetched_at": "2026-03-07T21:07:55.912859+00:00" +} diff --git a/dfetch_hub/data/github/faburaya/3fd.json b/dfetch_hub/data/github/faburaya/3fd.json new file mode 100644 index 0000000..e0f29c7 --- /dev/null +++ b/dfetch_hub/data/github/faburaya/3fd.json @@ -0,0 +1,109 @@ +{ + "canonical_url": "https://github.com/faburaya/3fd", + "org": "faburaya", + "repo": "3fd", + "subfolder_path": null, + "catalog_sources": [ + { + "source_name": "vcpkg", + "label": "vcpkg", + "index_path": "ports/3fd", + "registry_version": "2.6.3" + } + ], + "manifests": [], + "readme": "A framework to help C++ programmers to code faster more robust and stable code.\n\nThe scope of 
this project is to provide a modern framework that simplifies common tasks like error handling and logging, ISAM/relational data access and storage, heterogeneous programming and memory management. It was carefully engineered to provide high performance, low memory footprint, verbosity in error report, a simple interface compliant to the new C++11 standard and portability to several platforms: POSIX, Windows Vista/7/8/8.1/10/10 UWP.\n\nThe provided features are:\n\n+ A fully capable and flexible garbage collector that performs fast memory reclaim\n\n+ A solid error and exception handling structure with stack tracing and logging with asynchronous IO (used by all the framework modules)\n\n+ An OOP+RAII model for manipulation of relational data built on top of the latest stable releases of SQLite (the most widely deployed SQL embedded database engine in the world), with support for transactions, concurrent access and robust error handling\n\n+ An OOP+RAII model for simplified heterogeneous programming that makes OpenCL v1.2 more practical and robust to employ\n\n+ A module for ISAM data access backed by Microsoft ESE, but wrapped with real OOP+RAII design, which allows a much easier way to define, search and intersect multiple indexes\n\n+ A module for SOAP web services containing helpers & wrappers around Microsoft WWS API, which enables quick development of native web services hosts and clients, including support for SSL transport security\n\n+ A module with queue reader & writer for service broker of Microsoft SQL Server, that greatly simplifies its deployment and use\n", + "tags": [ + { + "name": "v2.3.2", + "is_tag": true, + "commit_sha": "fae548786bff960954a579663ca0de0f55328629", + "date": null + }, + { + "name": "v2.3.2-RC1", + "is_tag": true, + "commit_sha": "fae548786bff960954a579663ca0de0f55328629", + "date": null + }, + { + "name": "v2.3.3", + "is_tag": true, + "commit_sha": "74034699aa9b7d9d19b1906bc11cd2e203df4620", + "date": null + }, + { + "name": 
"v2.4.1", + "is_tag": true, + "commit_sha": "2308e05d9335c02ffee6d2f326a2763d25987408", + "date": null + }, + { + "name": "v2.4.2", + "is_tag": true, + "commit_sha": "a66eff413195ba323a57291e30216ed73ee71992", + "date": null + }, + { + "name": "v2.4.3", + "is_tag": true, + "commit_sha": "c9e43151dc1223c85bf3d436658dcd5f9e8d8f73", + "date": null + }, + { + "name": "v2.5.2", + "is_tag": true, + "commit_sha": "694a4b43aafd539ceb428e5a905aafbbe0534816", + "date": null + }, + { + "name": "v2.5.3", + "is_tag": true, + "commit_sha": "233fc10f9b2d182e40430b094a7aa75158795f60", + "date": null + }, + { + "name": "v2.6.2", + "is_tag": true, + "commit_sha": "650eb11174c277d373601c61655e5f871fa01d2c", + "date": null + }, + { + "name": "v2.6.3", + "is_tag": true, + "commit_sha": "3a0fe606268721d1560b88dcca8647c67c0b275c", + "date": null + }, + { + "name": "v2.7.1", + "is_tag": true, + "commit_sha": "edb14c290b7d4e4cf3cd4de787cd94b3e63e8e86", + "date": null + }, + { + "name": "v2.7.2", + "is_tag": true, + "commit_sha": "2fa0b57b9702902178c878fe3d317d55ec4cdc95", + "date": null + }, + { + "name": "v2.8.0", + "is_tag": true, + "commit_sha": "cde5d33a2e22dc6a02423334b0a8b766348cb296", + "date": null + } + ], + "branches": [ + { + "name": "main", + "is_tag": false, + "commit_sha": null, + "date": null + } + ], + "urls": { + "Homepage": "https://github.com/faburaya/3fd" + }, + "license_text": null, + "fetched_at": "2026-03-07T21:34:13.599723+00:00" +} diff --git a/test/test_catalog_clib.py b/test/test_catalog_clib.py index ee5379c..8682207 100644 --- a/test/test_catalog_clib.py +++ b/test/test_catalog_clib.py @@ -100,6 +100,7 @@ def test_build_package_uses_vcs_url_when_no_homepage_in_json() -> None: ) assert pkg.homepage == "https://github.com/clibs/buffer" + assert pkg.in_project_repo is True def test_build_package_uses_json_homepage_as_canonical_url() -> None: diff --git a/test/test_catalog_conan.py b/test/test_catalog_conan.py index 9f6c776..997c2e6 100644 --- 
a/test/test_catalog_conan.py +++ b/test/test_catalog_conan.py @@ -168,6 +168,7 @@ def test_parse_conan_recipe_basic(recipe_dir: Path) -> None: assert m.license == "Apache-2.0" assert "algorithm" in m.topics assert "google" in m.topics + assert m.in_project_repo is False def test_parse_conan_recipe_latest_version(recipe_dir: Path) -> None: diff --git a/test/test_catalog_detail.py b/test/test_catalog_detail.py new file mode 100644 index 0000000..5666b31 --- /dev/null +++ b/test/test_catalog_detail.py @@ -0,0 +1,260 @@ +"""Tests for dfetch_hub.catalog.detail: CatalogDetail.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest.mock import patch + +import pytest + +from dfetch_hub.catalog.detail import CatalogDetail +from dfetch_hub.catalog.model import CatalogSource + +if TYPE_CHECKING: + pass + + +def _manifest( + entry_name: str = "abseil", + package_name: str = "abseil-cpp", + description: str = "Abseil C++ libraries from Google", + homepage: str | None = "https://github.com/abseil/abseil-cpp", + license_: str | None = "Apache-2.0", + version: str | None = "20240116.2", + subpath: str | None = None, +): + from dfetch_hub.catalog.sources import BaseManifest + + return BaseManifest( + entry_name=entry_name, + package_name=package_name, + description=description, + homepage=homepage, + license=license_, + version=version, + subpath=subpath, + ) + + +def test_catalog_detail_from_manifest() -> None: + """from_manifest creates a proper detail.""" + detail = CatalogDetail.from_manifest( + _manifest(), + "abseil", + "abseil-cpp", + "vcpkg", + "vcpkg", + "ports", + ) + assert detail.org == "abseil" + assert detail.repo == "abseil-cpp" + assert len(detail.catalog_sources) == 1 + assert detail.catalog_sources[0].source_name == "vcpkg" + + +def test_catalog_detail_from_dict_roundtrip() -> None: + """from_dict + to_dict preserves data.""" + original = CatalogDetail.from_manifest( + _manifest(), + "org", + "repo", + "source", + "label", + 
"path", + ) + restored = CatalogDetail.from_dict(original.to_dict()) + assert restored.org == original.org + assert restored.repo == original.repo + + +def test_catalog_detail_add_source_updates_existing() -> None: + """add_source updates existing source.""" + from dfetch_hub.catalog.model import VCSLocation + + detail = CatalogDetail( + location=VCSLocation(host="", org="org", repo="repo"), + catalog_sources=[CatalogSource(source_name="src1", label="label1", index_path="path1")], + ) + detail.add_source(_manifest(version="2.0"), "src1", "label1", "newpath") + assert len(detail.catalog_sources) == 1 + assert detail.catalog_sources[0].registry_version == "2.0" + + +def test_catalog_detail_add_source_appends_new() -> None: + """add_source appends new source.""" + from dfetch_hub.catalog.model import VCSLocation + + detail = CatalogDetail( + location=VCSLocation(host="", org="org", repo="repo"), + catalog_sources=[CatalogSource(source_name="src1", label="label1", index_path="path1")], + ) + detail.add_source(_manifest(), "src2", "label2", "path2") + assert len(detail.catalog_sources) == 2 + + +def test_catalog_detail_readme_content_from_manifest() -> None: + """readme_content on manifest replaces generated readme.""" + from dfetch_hub.catalog.sources.clib import CLibPackage + + m = CLibPackage( + entry_name="clibs/buffer", + package_name="buffer", + description="Tiny C buffer library", + homepage="https://github.com/clibs/buffer", + license="MIT", + version="0.4.0", + readme_content="# Real README from upstream", + ) + detail = CatalogDetail.from_manifest(m, "clibs", "buffer", "clib", "clib", "clib") + assert detail.readme == "# Real README from upstream" + + +def test_catalog_detail_urls_from_manifest() -> None: + """urls from manifest are written to detail.""" + from dfetch_hub.catalog.sources import BaseManifest + + m = BaseManifest( + entry_name="abseil", + package_name="abseil-cpp", + description="desc", + homepage="https://github.com/abseil/abseil-cpp", + 
license=None, + version=None, + urls={"Homepage": "https://github.com/abseil/abseil-cpp", "Source": "https://github.com/x/y"}, + ) + detail = CatalogDetail.from_manifest(m, "abseil", "abseil-cpp", "vcpkg", "vcpkg", "ports") + detail.update_from_manifest(m, "abseil-cpp", "vcpkg", "vcpkg", "ports") + assert detail.urls["Homepage"] == "https://github.com/abseil/abseil-cpp" + assert detail.urls["Source"] == "https://github.com/x/y" + + +_FULL_SHA = "a" * 40 + + +def test_fetch_upstream_tags_returns_tag_entries() -> None: + """Tags are extracted from refs/tags/* entries returned by ls-remote.""" + ls_remote = { + "refs/tags/v1.0.0": _FULL_SHA, + "refs/tags/v2.0.0": "b" * 40, + "refs/heads/main": "c" * 40, + } + with patch("dfetch_hub.catalog.detail.GitRemote._ls_remote", return_value=ls_remote): + tags = CatalogDetail.fetch_upstream_tags("https://github.com/owner/repo") + + tag_names = {t.name for t in tags} + assert tag_names == {"v1.0.0", "v2.0.0"} + + +def test_fetch_upstream_tags_excludes_branch_refs() -> None: + """Entries under refs/heads/ are not returned as tags.""" + ls_remote = { + "refs/heads/main": _FULL_SHA, + "refs/heads/dev": "b" * 40, + } + with patch("dfetch_hub.catalog.detail.GitRemote._ls_remote", return_value=ls_remote): + tags = CatalogDetail.fetch_upstream_tags("https://github.com/owner/repo") + + assert tags == [] + + +def test_fetch_upstream_tags_commit_sha_is_full_length() -> None: + """commit_sha is the full 40-character SHA.""" + ls_remote = {"refs/tags/v1.0.0": _FULL_SHA} + with patch("dfetch_hub.catalog.detail.GitRemote._ls_remote", return_value=ls_remote): + tags = CatalogDetail.fetch_upstream_tags("https://github.com/owner/repo") + + assert len(tags) == 1 + assert tags[0].commit_sha == _FULL_SHA + + +def test_fetch_upstream_tags_is_tag_true() -> None: + """Every entry has is_tag set to True.""" + ls_remote = {"refs/tags/v1.0.0": _FULL_SHA} + with patch("dfetch_hub.catalog.detail.GitRemote._ls_remote", return_value=ls_remote): + tags = 
CatalogDetail.fetch_upstream_tags("https://github.com/owner/repo") + + assert tags[0].is_tag is True + + +def test_fetch_upstream_tags_name_strips_refs_prefix() -> None: + """The 'refs/tags/' prefix is stripped from the tag name.""" + ls_remote = {"refs/tags/release-2024": _FULL_SHA} + with patch("dfetch_hub.catalog.detail.GitRemote._ls_remote", return_value=ls_remote): + tags = CatalogDetail.fetch_upstream_tags("https://github.com/owner/repo") + + assert tags[0].name == "release-2024" + + +def test_fetch_upstream_tags_returns_empty_on_error() -> None: + """Returns an empty list when ls-remote raises an exception.""" + with patch( + "dfetch_hub.catalog.detail.GitRemote._ls_remote", + side_effect=RuntimeError("network error"), + ): + tags = CatalogDetail.fetch_upstream_tags("https://github.com/owner/repo") + + assert tags == [] + + +def test_generate_readme_contains_package_name() -> None: + """Package name appears in the generated README heading.""" + m = _manifest() + readme = CatalogDetail.generate_readme(m, "abseil-cpp", "https://github.com/abseil/abseil-cpp") + assert "abseil-cpp" in readme + + +def test_generate_readme_contains_description() -> None: + """Package description appears in the generated README.""" + m = _manifest() + readme = CatalogDetail.generate_readme(m, "abseil-cpp", "https://github.com/abseil/abseil-cpp") + assert "Abseil C++ libraries" in readme + + +def test_generate_readme_contains_version_tag() -> None: + """Version tag appears in the dfetch.yaml snippet.""" + m = _manifest(version="20240116.2") + readme = CatalogDetail.generate_readme(m, "abseil-cpp", "https://github.com/abseil/abseil-cpp") + assert "20240116.2" in readme + + +def test_generate_readme_omits_tag_when_no_version() -> None: + """No 'tag:' line is emitted when version is None.""" + m = _manifest(version=None) + readme = CatalogDetail.generate_readme(m, "abseil-cpp", "https://github.com/abseil/abseil-cpp") + assert "tag:" not in readme + + +def 
test_generate_readme_contains_dfetch_yaml_snippet() -> None: + """The generated README contains a dfetch.yaml code block.""" + m = _manifest() + readme = CatalogDetail.generate_readme(m, "abseil-cpp", "https://github.com/abseil/abseil-cpp") + assert "dfetch.yaml" in readme + + +def test_generate_readme_uses_provided_url() -> None: + """The URL passed in appears verbatim in the dfetch.yaml snippet.""" + m = _manifest() + readme = CatalogDetail.generate_readme(m, "myrepo", "https://gitlab.com/myorg/myrepo") + assert "https://gitlab.com/myorg/myrepo" in readme + + +def test_generate_readme_monorepo_includes_src_line() -> None: + """Monorepo components include a 'src:' line with the subpath.""" + m = _manifest(subpath="mylib") + readme = CatalogDetail.generate_readme(m, "mymonorepo", "https://github.com/org/mymonorepo") + assert "src: mylib" in readme + + +def test_generate_readme_monorepo_uses_subpath_as_local_name() -> None: + """The local checkout name (ext/) uses subpath, not the repo name.""" + m = _manifest(subpath="mylib") + readme = CatalogDetail.generate_readme(m, "mymonorepo", "https://github.com/org/mymonorepo") + assert "ext/mylib" in readme + assert "ext/mymonorepo" not in readme + + +def test_generate_readme_no_subpath_no_src_line() -> None: + """Packages without a subpath do not emit a 'src:' line.""" + m = _manifest() + readme = CatalogDetail.generate_readme(m, "abseil-cpp", "https://github.com/abseil/abseil-cpp") + assert "src:" not in readme diff --git a/test/test_catalog_entry.py b/test/test_catalog_entry.py new file mode 100644 index 0000000..8c9399b --- /dev/null +++ b/test/test_catalog_entry.py @@ -0,0 +1,134 @@ +"""Tests for dfetch_hub.catalog.entry: CatalogEntry.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + pass + +from dfetch_hub.catalog.entry import CatalogEntry +from dfetch_hub.catalog.model import Tag + + +def _manifest( + entry_name: str = "abseil", + package_name: str = "abseil-cpp", + 
description: str = "Abseil C++ libraries from Google", + homepage: str | None = "https://github.com/abseil/abseil-cpp", + license_: str | None = "Apache-2.0", + version: str | None = "20240116.2", + subpath: str | None = None, +): + from dfetch_hub.catalog.sources import BaseManifest + + return BaseManifest( + entry_name=entry_name, + package_name=package_name, + description=description, + homepage=homepage, + license=license_, + version=version, + subpath=subpath, + ) + + +def test_catalog_entry_from_manifest() -> None: + """from_manifest creates a proper entry.""" + entry = CatalogEntry.from_manifest( + _manifest(), + "github", + "abseil", + "abseil-cpp", + "vcpkg", + ) + assert entry.id == "github/abseil/abseil-cpp" + assert entry.name == "abseil-cpp" + assert entry.description == "Abseil C++ libraries from Google" + assert entry.license == "Apache-2.0" + assert entry.source_type == "github" + assert "vcpkg" in entry.source_labels + assert any(t.name == "20240116.2" for t in entry.tags) + + +def test_catalog_entry_from_dict_roundtrip() -> None: + """from_dict + to_dict preserves data.""" + original = CatalogEntry.from_manifest( + _manifest(version="1.0.0"), + "github", + "org", + "repo", + "label", + ) + restored = CatalogEntry.from_dict(original.to_dict()) + assert restored.id == original.id + assert restored.name == original.name + + +def test_catalog_entry_merge_backfills_missing() -> None: + """merge_from_manifest backfills missing description and license.""" + entry = CatalogEntry(cat_id="test", name="test", description=None, license_str=None) + entry.merge_from_manifest(_manifest(), is_update=True, label="new") + assert entry.description == "Abseil C++ libraries from Google" + assert entry.license == "Apache-2.0" + + +def test_catalog_entry_merge_preserves_existing() -> None: + """merge_from_manifest does not overwrite existing description/license.""" + entry = CatalogEntry(cat_id="test", name="test", description="old", license_str="MIT") + 
entry.merge_from_manifest(_manifest(), is_update=True, label="new") + assert entry.description == "old" + assert entry.license == "MIT" + + +def test_catalog_entry_merge_adds_labels() -> None: + """merge_from_manifest adds new labels without duplicating.""" + entry = CatalogEntry(cat_id="test", name="test", source_labels=["conan"]) + entry.merge_from_manifest(_manifest(), is_update=True, label="vcpkg") + assert "conan" in entry.source_labels + assert "vcpkg" in entry.source_labels + assert entry.source_labels.count("vcpkg") == 1 + + +def test_catalog_entry_update_tags_no_duplicate() -> None: + """update_tags doesn't add duplicate tags (with or without 'v' prefix).""" + entry = CatalogEntry(cat_id="test", name="test", tags=[Tag(name="v1.0.0", is_tag=True)]) + entry.update_tags("1.0.0") + assert sum(1 for t in entry.tags if t.name.lstrip("v") == "1.0.0") == 1 + + +def test_catalog_entry_update_tags_no_duplicate_version() -> None: + """update_tags doesn't add same version twice.""" + entry = CatalogEntry(cat_id="test", name="test", tags=[Tag(name="20240116.2", is_tag=True)]) + entry.update_tags("20240116.2") + assert sum(1 for t in entry.tags if t.name == "20240116.2") == 1 + + +def test_catalog_id_format() -> None: + """Catalog ID format is vcs_host/org/repo.""" + assert CatalogEntry.catalog_id("github", "abseil", "abseil-cpp") == "github/abseil/abseil-cpp" + + +def test_catalog_id_lowercases_inputs() -> None: + """All components are lowercased.""" + assert CatalogEntry.catalog_id("GITHUB", "Abseil", "Abseil-CPP") == "github/abseil/abseil-cpp" + + +def test_catalog_id_with_subpath() -> None: + """Monorepo component includes the subpath segment.""" + assert CatalogEntry.catalog_id("github", "org", "repo", "mylib") == "github/org/repo/mylib" + + +def test_vcs_host_label_github() -> None: + """Maps github.com to github.""" + assert CatalogEntry.vcs_host_label("github.com") == "github" + + +def test_vcs_host_label_gitlab() -> None: + """Maps gitlab.com to gitlab.""" + 
assert CatalogEntry.vcs_host_label("gitlab.com") == "gitlab" + + +def test_vcs_host_label_unknown() -> None: + """Unknown hosts are returned as-is.""" + assert CatalogEntry.vcs_host_label("gitea.example.com") == "gitea.example.com" diff --git a/test/test_catalog_model.py b/test/test_catalog_model.py new file mode 100644 index 0000000..e8cbce3 --- /dev/null +++ b/test/test_catalog_model.py @@ -0,0 +1,68 @@ +"""Tests for dfetch_hub.catalog.model: Tag, CatalogSource, and VCSLocation.""" + +from __future__ import annotations + +from dfetch_hub.catalog.model import CatalogSource, Tag, VCSLocation + + +def test_vcs_location_catalog_id() -> None: + """VCSLocation produces correct catalog ID.""" + loc = VCSLocation("github", "abseil", "abseil-cpp") + assert loc.catalog_id == "github/abseil/abseil-cpp" + + +def test_vcs_location_catalog_id_with_subpath() -> None: + """VCSLocation includes subpath in catalog ID.""" + loc = VCSLocation("github", "org", "repo", "mylib") + assert loc.catalog_id == "github/org/repo/mylib" + + +def test_vcs_location_catalog_id_lowercases() -> None: + """VCSLocation lowercases all components.""" + loc = VCSLocation("GITHUB", "Abseil", "Abseil-CPP") + assert loc.catalog_id == "github/abseil/abseil-cpp" + + +def test_vcs_location_catalog_id_subpath_lowercases() -> None: + """VCSLocation lowercases subpath.""" + loc = VCSLocation("github", "org", "repo", "MyLib") + assert loc.catalog_id == "github/org/repo/mylib" + + +def test_tag_to_dict_roundtrip() -> None: + """to_dict + from_dict preserves data.""" + original = Tag(name="v1.0.0", is_tag=True, commit_sha="abc123", date="2024-01-01") + restored = Tag.from_dict(original.to_dict()) + assert restored.name == original.name + assert restored.is_tag == original.is_tag + + +def test_tag_from_dict_with_defaults() -> None: + """from_dict handles missing keys with defaults.""" + tag = Tag.from_dict({}) + assert tag.name == "" + assert tag.is_tag is True + + +def test_catalog_source_to_dict_roundtrip() -> 
None: + """to_dict + from_dict preserves data.""" + original = CatalogSource( + source_name="vcpkg", + label="vcpkg", + index_path="ports/abseil", + registry_version="1.0.0", + ) + restored = CatalogSource.from_dict(original.to_dict()) + assert restored.source_name == original.source_name + assert restored.label == original.label + assert restored.index_path == original.index_path + assert restored.registry_version == original.registry_version + + +def test_catalog_source_from_dict_with_defaults() -> None: + """from_dict handles missing keys with defaults.""" + source = CatalogSource.from_dict({}) + assert source.source_name == "" + assert source.label == "" + assert source.index_path == "" + assert source.registry_version is None diff --git a/test/test_catalog_readme.py b/test/test_catalog_readme.py index d52bf1a..f441349 100644 --- a/test/test_catalog_readme.py +++ b/test/test_catalog_readme.py @@ -142,6 +142,7 @@ def test_parses_readme_md(self, tmp_path: Path) -> None: assert result.version is None assert result.readme_content is not None assert "A great package." 
in result.readme_content + assert result.in_project_repo is True def test_parses_readme_rst(self, tmp_path: Path) -> None: """Falls back to README.rst when README.md is absent.""" diff --git a/test/test_catalog_sources.py b/test/test_catalog_sources.py index 1c7f2f8..2a07639 100644 --- a/test/test_catalog_sources.py +++ b/test/test_catalog_sources.py @@ -31,6 +31,19 @@ def test_base_manifest_urls_defaults_to_empty_dict() -> None: assert m.urls == {} +def test_base_manifest_in_project_repo_defaults_to_false() -> None: + """in_project_repo defaults to False — manifests are external registry entries by default.""" + m = BaseManifest( + entry_name="pkg", + package_name="pkg", + description="desc", + homepage=None, + license=None, + version=None, + ) + assert m.in_project_repo is False + + def test_base_manifest_urls_accepts_populated_dict() -> None: """urls field stores the supplied mapping unchanged.""" m = BaseManifest( @@ -174,3 +187,72 @@ def test_fetch_readme_for_homepage_delegates_to_fetch_readme_for_github() -> Non mock_fn.assert_called_once_with("myorg", "myrepo") assert result == "# content" + + +# --------------------------------------------------------------------------- +# BaseManifest.sanitize_subpath +# --------------------------------------------------------------------------- + + +def test_sanitize_subpath_returns_none_for_none() -> None: + """None input returns None.""" + assert BaseManifest.sanitize_subpath(None) is None + + +def test_sanitize_subpath_returns_none_for_empty_string() -> None: + """Empty string returns None.""" + assert BaseManifest.sanitize_subpath("") is None + + +def test_sanitize_subpath_returns_clean_path_unchanged() -> None: + """A valid forward-slash path is returned as-is.""" + assert BaseManifest.sanitize_subpath("ports/zlib") == "ports/zlib" + + +def test_sanitize_subpath_strips_leading_slash() -> None: + """A leading forward slash is stripped.""" + assert BaseManifest.sanitize_subpath("/foo") == "foo" + + +def 
test_sanitize_subpath_strips_trailing_slash() -> None: + """A trailing forward slash is stripped.""" + assert BaseManifest.sanitize_subpath("foo/") == "foo" + + +def test_sanitize_subpath_normalizes_backslash_to_forward_slash() -> None: + """Backslash separators are normalized to forward slashes and returned.""" + assert BaseManifest.sanitize_subpath("foo\\bar") == "foo/bar" + + +def test_sanitize_subpath_strips_leading_backslash() -> None: + """A leading backslash is treated as a path separator and stripped.""" + assert BaseManifest.sanitize_subpath("\\foo") == "foo" + + +def test_sanitize_subpath_rejects_dotdot_after_backslash_normalization() -> None: + """Backslash-encoded path traversal is blocked once backslashes are normalized.""" + assert BaseManifest.sanitize_subpath("foo\\..\\bar") is None + + +def test_sanitize_subpath_rejects_leading_dot() -> None: + """Paths starting with '.' are rejected.""" + assert BaseManifest.sanitize_subpath(".hidden") is None + + +def test_sanitize_subpath_rejects_dotdot_segment() -> None: + """Paths containing '..' 
segments are rejected.""" + assert BaseManifest.sanitize_subpath("foo/../bar") is None + + +def test_sanitize_subpath_property_returns_normalized_backslash_path() -> None: + """sanitized_subpath property on a manifest returns the backslash-normalized value.""" + m = BaseManifest( + entry_name="pkg", + package_name="pkg", + description="", + homepage=None, + license=None, + version=None, + subpath="foo\\bar", + ) + assert m.sanitized_subpath == "foo/bar" diff --git a/test/test_catalog_vcpkg.py b/test/test_catalog_vcpkg.py index d7f9aa3..bd6425d 100644 --- a/test/test_catalog_vcpkg.py +++ b/test/test_catalog_vcpkg.py @@ -11,6 +11,7 @@ import pytest +from dfetch_hub.catalog.sources.vcpkg import _github_url_from_portfile # noqa: PLC2701 from dfetch_hub.catalog.sources.vcpkg import ( VcpkgManifest, _extract_dependencies, @@ -135,6 +136,7 @@ def test_parse_vcpkg_json_basic_fields(tmp_path: Path) -> None: assert result.homepage == "https://github.com/abseil/abseil-cpp" assert result.license == "Apache-2.0" assert result.entry_name == "abseil" + assert result.in_project_repo is False def test_parse_vcpkg_json_dependencies(tmp_path: Path) -> None: @@ -264,7 +266,7 @@ def test_parse_vcpkg_json_urls_contains_homepage(tmp_path: Path) -> None: def test_parse_vcpkg_json_urls_empty_without_homepage(tmp_path: Path) -> None: - """urls dict is empty when vcpkg.json has no homepage field.""" + """urls dict is empty when vcpkg.json has no homepage field and no portfile.""" pkg = tmp_path / "pkg" pkg.mkdir() (pkg / "vcpkg.json").write_text(json.dumps({"name": "pkg"}), encoding="utf-8") @@ -273,3 +275,109 @@ def test_parse_vcpkg_json_urls_empty_without_homepage(tmp_path: Path) -> None: assert result is not None assert result.urls == {} + + +# --------------------------------------------------------------------------- +# _github_url_from_portfile +# --------------------------------------------------------------------------- + +_PORTFILE_WITH_GITHUB = """\ +vcpkg_from_github( + 
OUT_SOURCE_PATH SOURCE_PATH + REPO faburaya/3fd + REF 3a0fe606268721d1560b88dcca8647c67c0b275c # v2.6.3 (Stable) + SHA512 70630291b4055de2044ad76ef21e99d6ab6fd3468de + HEAD_REF master +) +""" + +_PORTFILE_WITHOUT_GITHUB = """\ +vcpkg_check_linkage(ONLY_STATIC_LIBRARY) +vcpkg_from_gitlab( + GITLAB_URL https://gitlab.com + OUT_SOURCE_PATH SOURCE_PATH + REPO myorg/mylib + REF v1.0 + SHA512 abc123 +) +""" + + +def test_github_url_from_portfile_returns_url(tmp_path: Path) -> None: + """Extracts a https://github.com/owner/repo URL from vcpkg_from_github(REPO ...).""" + pkg = tmp_path / "3fd" + pkg.mkdir() + (pkg / "portfile.cmake").write_text(_PORTFILE_WITH_GITHUB, encoding="utf-8") + + assert _github_url_from_portfile(pkg) == "https://github.com/faburaya/3fd" + + +def test_github_url_from_portfile_returns_none_when_no_portfile(tmp_path: Path) -> None: + """Returns None when neither portfile.cmake nor port.cmake exists.""" + pkg = tmp_path / "pkg" + pkg.mkdir() + + assert _github_url_from_portfile(pkg) is None + + +def test_github_url_from_portfile_returns_none_when_no_vcpkg_from_github(tmp_path: Path) -> None: + """Returns None when portfile.cmake exists but has no vcpkg_from_github call.""" + pkg = tmp_path / "pkg" + pkg.mkdir() + (pkg / "portfile.cmake").write_text(_PORTFILE_WITHOUT_GITHUB, encoding="utf-8") + + assert _github_url_from_portfile(pkg) is None + + +def test_github_url_from_portfile_supports_port_cmake_name(tmp_path: Path) -> None: + """Falls back to port.cmake when portfile.cmake is absent.""" + pkg = tmp_path / "3fd" + pkg.mkdir() + (pkg / "port.cmake").write_text(_PORTFILE_WITH_GITHUB, encoding="utf-8") + + assert _github_url_from_portfile(pkg) == "https://github.com/faburaya/3fd" + + +# --------------------------------------------------------------------------- +# parse_vcpkg_json — portfile fallback +# --------------------------------------------------------------------------- + + +def test_parse_vcpkg_json_uses_portfile_when_no_homepage(tmp_path: 
Path) -> None: + """homepage is extracted from portfile.cmake when vcpkg.json has none.""" + pkg = tmp_path / "3fd" + pkg.mkdir() + (pkg / "vcpkg.json").write_text(json.dumps({"name": "3fd", "description": "A lib"}), encoding="utf-8") + (pkg / "portfile.cmake").write_text(_PORTFILE_WITH_GITHUB, encoding="utf-8") + + result = parse_vcpkg_json(pkg) + + assert result is not None + assert result.homepage == "https://github.com/faburaya/3fd" + assert result.urls.get("Homepage") == "https://github.com/faburaya/3fd" + + +def test_parse_vcpkg_json_prefers_vcpkg_json_homepage_over_portfile(tmp_path: Path) -> None: + """vcpkg.json homepage takes priority; portfile.cmake is not consulted.""" + pkg = tmp_path / "abseil" + pkg.mkdir() + (pkg / "vcpkg.json").write_text(json.dumps(_VCPKG_JSON), encoding="utf-8") + (pkg / "portfile.cmake").write_text(_PORTFILE_WITH_GITHUB, encoding="utf-8") + + result = parse_vcpkg_json(pkg) + + assert result is not None + assert result.homepage == "https://github.com/abseil/abseil-cpp" + + +def test_parse_vcpkg_json_homepage_none_when_portfile_has_no_github(tmp_path: Path) -> None: + """homepage stays None when portfile.cmake has no vcpkg_from_github call.""" + pkg = tmp_path / "pkg" + pkg.mkdir() + (pkg / "vcpkg.json").write_text(json.dumps({"name": "pkg", "description": "a lib"}), encoding="utf-8") + (pkg / "portfile.cmake").write_text(_PORTFILE_WITHOUT_GITHUB, encoding="utf-8") + + result = parse_vcpkg_json(pkg) + + assert result is not None + assert result.homepage is None diff --git a/test/test_catalog_writer.py b/test/test_catalog_writer.py index 7618160..4276b02 100644 --- a/test/test_catalog_writer.py +++ b/test/test_catalog_writer.py @@ -1,795 +1,352 @@ -"""Tests for dfetch_hub.catalog.writer: catalog JSON writing pipeline. - -Covers: -- parse_vcs_slug: URL parsing and lowercase normalisation. -- _catalog_id: ID string format. -- _merge_catalog_entry: create / update catalog.json entries. -- _generate_readme: fallback README content. 
-- _merge_detail: create / update per-project detail JSONs. -- write_catalog: full pipeline against a tmp_path data directory. -""" - -from __future__ import annotations - -import json -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from pathlib import Path -from unittest.mock import patch - -import pytest - -from dfetch_hub.catalog.sources import BaseManifest, parse_vcs_slug -from dfetch_hub.catalog.sources.clib import CLibPackage -from dfetch_hub.catalog.writer import ( - _catalog_id, - _fetch_upstream_tags, - _generate_readme, - _merge_catalog_entry, - _merge_detail, - write_catalog, -) - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _manifest( # pylint: disable=too-many-arguments,too-many-positional-arguments - entry_name: str = "abseil", - package_name: str = "abseil-cpp", - description: str = "Abseil C++ libraries from Google", - homepage: str | None = "https://github.com/abseil/abseil-cpp", - license_: str | None = "Apache-2.0", - version: str | None = "20240116.2", -) -> BaseManifest: - """Build a minimal BaseManifest with sensible defaults for testing.""" - return BaseManifest( - entry_name=entry_name, - package_name=package_name, - description=description, - homepage=homepage, - license=license_, - version=version, - ) - - -def _existing_catalog_entry(label: str = "vcpkg") -> dict[str, Any]: - """Return a minimal pre-existing catalog.json entry for abseil-cpp.""" - return { - "id": "github/abseil/abseil-cpp", - "name": "abseil-cpp", - "description": "old description", - "url": "https://github.com/abseil/abseil-cpp", - "source_type": "github", - "default_branch": "main", - "license": None, - "topics": [], - "stars": 0, - "last_updated": "2024-01-01T00:00:00+00:00", - "source_labels": [label], - "tags": [], - } - - -def _existing_detail() -> dict[str, Any]: - """Return a minimal pre-existing per-project detail 
JSON for abseil-cpp.""" - return { - "canonical_url": "https://github.com/abseil/abseil-cpp", - "org": "abseil", - "repo": "abseil-cpp", - "subfolder_path": None, - "catalog_sources": [ - { - "source_name": "vcpkg", - "label": "vcpkg", - "index_path": "ports/abseil", - "registry_version": "1.0", - } - ], - "manifests": [], - "readme": "placeholder readme", - "tags": [], - "branches": [{"name": "main", "is_tag": False, "commit_sha": None, "date": None}], - "license_text": None, - "fetched_at": "2024-01-01T00:00:00+00:00", - } - - -# --------------------------------------------------------------------------- -# parse_vcs_slug -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize( - "url, expected", - [ - ( - "https://github.com/abseil/abseil-cpp", - ("github.com", "abseil", "abseil-cpp"), - ), - ( - "https://github.com/abseil/abseil-cpp.git", - ("github.com", "abseil", "abseil-cpp"), - ), - ( - "https://github.com/abseil/abseil-cpp/", - ("github.com", "abseil", "abseil-cpp"), - ), - ("http://github.com/foo/bar", ("github.com", "foo", "bar")), - ("https://gitlab.com/org/repo", ("gitlab.com", "org", "repo")), - ( - "https://gitlab.com/group/subgroup/repo", - ("gitlab.com", "group/subgroup", "repo"), - ), - ( - "https://gitlab.com/group/subgroup/nested/repo", - ("gitlab.com", "group/subgroup/nested", "repo"), - ), - ("https://bitbucket.org/org/repo", ("bitbucket.org", "org", "repo")), - ( - "https://gitea.example.com/org/repo", - ("gitea.example.com", "org", "repo"), - ), - ], -) -def test_parse_vcs_slug_valid(url: str, expected: tuple[str, str, str]) -> None: - """Parses host, owner and repo from any https://host/owner/repo URL.""" - assert parse_vcs_slug(url) == expected - - -def test_parse_vcs_slug_lowercases_all_parts() -> None: - """All three parts of the returned tuple are lowercased.""" - assert parse_vcs_slug("https://GitHub.COM/ABSEIL/Abseil-CPP") == ( - "github.com", - "abseil", - "abseil-cpp", - ) - - 
-@pytest.mark.parametrize( - "url", - [ - "not-a-url", - "", - "https://github.com/only-org", - ], -) -def test_parse_vcs_slug_invalid_returns_none(url: str) -> None: - """Returns None for URLs that do not match host/owner/repo.""" - assert parse_vcs_slug(url) is None - - -# --------------------------------------------------------------------------- -# _catalog_id -# --------------------------------------------------------------------------- - - -def test_catalog_id_format() -> None: - """Produces vcs_host/org/repo format.""" - assert _catalog_id("github", "abseil", "abseil-cpp") == "github/abseil/abseil-cpp" - - -def test_catalog_id_lowercases_inputs() -> None: - """All components are lowercased.""" - assert _catalog_id("GITHUB", "Abseil", "Abseil-CPP") == "github/abseil/abseil-cpp" - - -def test_catalog_id_gitlab() -> None: - """Works for non-GitHub VCS hosts.""" - assert _catalog_id("gitlab", "org", "repo") == "gitlab/org/repo" - - -# --------------------------------------------------------------------------- -# _merge_catalog_entry -# --------------------------------------------------------------------------- - - -def test_merge_catalog_entry_new_has_correct_id() -> None: - """New entry has the correct vcs_host/org/repo ID.""" - entry = _merge_catalog_entry(None, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert entry["id"] == "github/abseil/abseil-cpp" - - -def test_merge_catalog_entry_new_populates_name() -> None: - """New entry takes package_name from the manifest.""" - entry = _merge_catalog_entry(None, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert entry["name"] == "abseil-cpp" - - -def test_merge_catalog_entry_new_populates_description() -> None: - """New entry takes description from the manifest.""" - entry = _merge_catalog_entry(None, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert entry["description"] == "Abseil C++ libraries from Google" - - -def test_merge_catalog_entry_new_populates_license() -> 
None: - """New entry takes license from the manifest.""" - entry = _merge_catalog_entry(None, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert entry["license"] == "Apache-2.0" - - -def test_merge_catalog_entry_source_type_matches_vcs_host() -> None: - """source_type equals the vcs_host passed in.""" - github_entry = _merge_catalog_entry(None, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert github_entry["source_type"] == "github" - - gitlab_entry = _merge_catalog_entry( - None, - _manifest(homepage="https://gitlab.com/org/repo"), - "gitlab", - "org", - "repo", - "some-source", - ) - assert gitlab_entry["source_type"] == "gitlab" - - -def test_merge_catalog_entry_adds_source_label() -> None: - """Label is present in source_labels of the new entry.""" - entry = _merge_catalog_entry(None, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert "vcpkg" in entry["source_labels"] - - -def test_merge_catalog_entry_adds_version_tag() -> None: - """Version is recorded in the tags list.""" - entry = _merge_catalog_entry(None, _manifest(version="20240116.2"), "github", "abseil", "abseil-cpp", "vcpkg") - tag_names = {t["name"] for t in entry["tags"]} - assert "20240116.2" in tag_names - - -def test_merge_catalog_entry_no_duplicate_version_tag() -> None: - """The same version is not added a second time.""" - existing = _existing_catalog_entry() - existing["tags"] = [{"name": "20240116.2", "is_tag": True, "commit_sha": None, "date": None}] - entry = _merge_catalog_entry( - existing, - _manifest(version="20240116.2"), - "github", - "abseil", - "abseil-cpp", - "vcpkg", - ) - assert sum(1 for t in entry["tags"] if t["name"] == "20240116.2") == 1 - - -def test_merge_catalog_entry_merges_source_labels() -> None: - """Updating an existing entry preserves its other labels.""" - existing = _existing_catalog_entry(label="conan") - entry = _merge_catalog_entry(existing, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert "conan" in 
entry["source_labels"] - assert "vcpkg" in entry["source_labels"] - - -def test_merge_catalog_entry_no_duplicate_label() -> None: - """Applying the same label twice does not duplicate it.""" - existing = _existing_catalog_entry(label="vcpkg") - entry = _merge_catalog_entry(existing, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert entry["source_labels"].count("vcpkg") == 1 - - -def test_merge_catalog_entry_url_uses_homepage() -> None: - """New entry uses manifest.homepage as the package URL.""" - entry = _merge_catalog_entry(None, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert entry["url"] == "https://github.com/abseil/abseil-cpp" - - -def test_merge_catalog_entry_no_version_no_tag_added() -> None: - """No tag entry is created when version is None.""" - entry = _merge_catalog_entry(None, _manifest(version=None), "github", "abseil", "abseil-cpp", "vcpkg") - assert not entry["tags"] - - -def test_merge_catalog_entry_backfills_missing_description() -> None: - """Existing entry with no description is backfilled from the manifest.""" - existing = _existing_catalog_entry() - existing["description"] = None - entry = _merge_catalog_entry(existing, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert entry["description"] == "Abseil C++ libraries from Google" - - -def test_merge_catalog_entry_does_not_overwrite_existing_description() -> None: - """An already-populated description must not be replaced by the manifest.""" - existing = _existing_catalog_entry() # description = "old description" - entry = _merge_catalog_entry(existing, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert entry["description"] == "old description" - - -def test_merge_catalog_entry_backfills_missing_license() -> None: - """Existing entry with no license is backfilled from the manifest.""" - existing = _existing_catalog_entry() # license = None by default - entry = _merge_catalog_entry(existing, _manifest(), "github", "abseil", "abseil-cpp", 
"vcpkg") - assert entry["license"] == "Apache-2.0" - - -def test_merge_catalog_entry_does_not_overwrite_existing_license() -> None: - """An already-populated license must not be replaced by the manifest.""" - existing = _existing_catalog_entry() - existing["license"] = "MIT" - entry = _merge_catalog_entry(existing, _manifest(), "github", "abseil", "abseil-cpp", "vcpkg") - assert entry["license"] == "MIT" - - -def test_merge_catalog_entry_v_prefix_tag_not_duplicated() -> None: - """Version '1.2.3' is not added if 'v1.2.3' already exists in the tag list.""" - existing = _existing_catalog_entry() - existing["tags"] = [{"name": "v1.2.3", "is_tag": True, "commit_sha": None, "date": None}] - entry = _merge_catalog_entry(existing, _manifest(version="1.2.3"), "github", "abseil", "abseil-cpp", "vcpkg") - assert sum(1 for t in entry["tags"] if t["name"].lstrip("v") == "1.2.3") == 1 - - -# --------------------------------------------------------------------------- -# _generate_readme -# --------------------------------------------------------------------------- - - -def test_generate_readme_contains_package_name() -> None: - """Package name appears in the generated README heading.""" - assert "abseil-cpp" in _generate_readme(_manifest(), "abseil-cpp", "https://github.com/abseil/abseil-cpp") - - -def test_generate_readme_contains_description() -> None: - """Package description appears in the generated README.""" - assert "Abseil C++ libraries" in _generate_readme(_manifest(), "abseil-cpp", "https://github.com/abseil/abseil-cpp") - - -def test_generate_readme_contains_version_tag() -> None: - """Version tag appears in the dfetch.yaml snippet.""" - assert "20240116.2" in _generate_readme( - _manifest(version="20240116.2"), - "abseil-cpp", - "https://github.com/abseil/abseil-cpp", - ) - - -def test_generate_readme_omits_tag_when_no_version() -> None: - """No 'tag:' line is emitted when version is None.""" - readme = _generate_readme(_manifest(version=None), "abseil-cpp", 
"https://github.com/abseil/abseil-cpp") - assert "tag:" not in readme - - -def test_generate_readme_contains_dfetch_yaml_snippet() -> None: - """The generated README contains a dfetch.yaml code block.""" - readme = _generate_readme(_manifest(), "abseil-cpp", "https://github.com/abseil/abseil-cpp") - assert "dfetch.yaml" in readme - - -def test_generate_readme_uses_provided_url() -> None: - """The URL passed in appears verbatim in the dfetch.yaml snippet.""" - readme = _generate_readme(_manifest(), "myrepo", "https://gitlab.com/myorg/myrepo") - assert "https://gitlab.com/myorg/myrepo" in readme - - -# --------------------------------------------------------------------------- -# _fetch_upstream_tags -# --------------------------------------------------------------------------- - -_FULL_SHA = "a" * 40 # realistic 40-char hex SHA - - -def test_fetch_upstream_tags_returns_tag_entries() -> None: - """Tags are extracted from refs/tags/* entries returned by ls-remote.""" - ls_remote = { - "refs/tags/v1.0.0": _FULL_SHA, - "refs/tags/v2.0.0": "b" * 40, - "refs/heads/main": "c" * 40, # branches must be excluded - } - with patch("dfetch_hub.catalog.writer.GitRemote._ls_remote", return_value=ls_remote): - tags = _fetch_upstream_tags("https://github.com/owner/repo") - - tag_names = {t["name"] for t in tags} - assert tag_names == {"v1.0.0", "v2.0.0"}, "branch ref must be excluded" - - -def test_fetch_upstream_tags_excludes_branch_refs() -> None: - """Entries under refs/heads/ are not returned as tags.""" - ls_remote = { - "refs/heads/main": _FULL_SHA, - "refs/heads/dev": "b" * 40, - } - with patch("dfetch_hub.catalog.writer.GitRemote._ls_remote", return_value=ls_remote): - tags = _fetch_upstream_tags("https://github.com/owner/repo") - - assert tags == [] - - -def test_fetch_upstream_tags_commit_sha_is_full_length() -> None: - """commit_sha must be the full 40-character SHA, not a shortened form.""" - ls_remote = {"refs/tags/v1.0.0": _FULL_SHA} - with 
patch("dfetch_hub.catalog.writer.GitRemote._ls_remote", return_value=ls_remote): - tags = _fetch_upstream_tags("https://github.com/owner/repo") - - assert len(tags) == 1 - assert tags[0]["commit_sha"] == _FULL_SHA - - -def test_fetch_upstream_tags_is_tag_true() -> None: - """Every entry returned has is_tag set to True.""" - ls_remote = {"refs/tags/v1.0.0": _FULL_SHA} - with patch("dfetch_hub.catalog.writer.GitRemote._ls_remote", return_value=ls_remote): - tags = _fetch_upstream_tags("https://github.com/owner/repo") - - assert tags[0]["is_tag"] is True - - -def test_fetch_upstream_tags_name_strips_refs_prefix() -> None: - """The 'refs/tags/' prefix is stripped from the tag name.""" - ls_remote = {"refs/tags/release-2024": _FULL_SHA} - with patch("dfetch_hub.catalog.writer.GitRemote._ls_remote", return_value=ls_remote): - tags = _fetch_upstream_tags("https://github.com/owner/repo") - - assert tags[0]["name"] == "release-2024" - - -def test_fetch_upstream_tags_returns_empty_on_error() -> None: - """Returns an empty list when ls-remote raises an exception.""" - with patch( - "dfetch_hub.catalog.writer.GitRemote._ls_remote", - side_effect=RuntimeError("network error"), - ): - tags = _fetch_upstream_tags("https://github.com/owner/repo") - - assert tags == [] - - -# --------------------------------------------------------------------------- -# _merge_detail -# --------------------------------------------------------------------------- - - -def test_merge_detail_new_sets_org_and_repo() -> None: - """New detail record stores org and repo.""" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail(None, _manifest(), "abseil", "abseil-cpp", "vcpkg", "vcpkg", "ports") - assert detail["org"] == "abseil" - assert detail["repo"] == "abseil-cpp" - - -def test_merge_detail_new_adds_catalog_source() -> None: - """New detail record contains exactly one catalog source entry.""" - with 
patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail(None, _manifest(), "abseil", "abseil-cpp", "vcpkg", "vcpkg", "ports") - sources = detail["catalog_sources"] - assert len(sources) == 1 - assert sources[0]["source_name"] == "vcpkg" - assert sources[0]["label"] == "vcpkg" - - -def test_merge_detail_readme_content_overwrites_generated() -> None: - """readme_content on the manifest (e.g. CLibPackage) replaces the generated placeholder.""" - m = CLibPackage( - entry_name="clibs/buffer", - package_name="buffer", - description="Tiny C buffer library", - homepage="https://github.com/clibs/buffer", - license="MIT", - version="0.4.0", - readme_content="# Real README from upstream", - ) - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail(None, m, "clibs", "buffer", "clib", "clib", "clib") - assert detail["readme"] == "# Real README from upstream" - - -def test_merge_detail_readme_content_overwrites_existing_readme() -> None: - """readme_content always overwrites, even when updating an existing detail.""" - m = CLibPackage( - entry_name="clibs/buffer", - package_name="buffer", - description="desc", - homepage="https://github.com/clibs/buffer", - license="MIT", - version="0.4.0", - readme_content="# Fresh README", - ) - existing = _existing_detail() - existing["org"] = "clibs" - existing["repo"] = "buffer" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail(existing, m, "clibs", "buffer", "clib", "clib", "clib") - assert detail["readme"] == "# Fresh README" - - -def test_merge_detail_updates_existing_catalog_source() -> None: - """Updating an existing source entry replaces registry_version in-place.""" - existing = _existing_detail() - m = _manifest(version="2.0") - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail(existing, m, "abseil", "abseil-cpp", "vcpkg", "vcpkg", 
"ports") - assert detail["catalog_sources"][0]["registry_version"] == "2.0" - assert len(detail["catalog_sources"]) == 1 - - -def test_merge_detail_appends_new_catalog_source() -> None: - """A second source is appended, not overwriting the first.""" - existing = _existing_detail() - m = _manifest() - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail(existing, m, "abseil", "abseil-cpp", "conan", "conan", "recipes") - source_names = [s["source_name"] for s in detail["catalog_sources"]] - assert "vcpkg" in source_names - assert "conan" in source_names - - -def test_merge_detail_version_tag_added_when_absent() -> None: - """The manifest version is added to the tags list if not already present.""" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail( - None, - _manifest(version="1.2.3"), - "abseil", - "abseil-cpp", - "vcpkg", - "vcpkg", - "ports", - ) - tag_names = {t["name"] for t in detail["tags"]} - assert "1.2.3" in tag_names - - -def test_merge_detail_version_tag_not_duplicated() -> None: - """The manifest version is not added again if already present (modulo leading v).""" - existing = _existing_detail() - existing["tags"] = [{"name": "v1.2.3", "is_tag": True, "commit_sha": None, "date": None}] - m = _manifest(version="1.2.3") - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail(existing, m, "abseil", "abseil-cpp", "vcpkg", "vcpkg", "ports") - assert sum(1 for t in detail["tags"] if t["name"].lstrip("v") == "1.2.3") == 1 - - -def test_merge_detail_stale_source_name_replaced_not_duplicated() -> None: - """A source entry with the same index_path but an old source_name is replaced. - - This covers the case where a source is renamed in dfetch-hub.toml (e.g. - "vcpkg-source" → "vcpkg"): the old entry must be purged so only one entry - survives, avoiding duplicate catalog_sources entries. 
- """ - existing = _existing_detail() - # Simulate a stale entry: same index_path ("ports/abseil") but old source_name - existing["catalog_sources"][0]["source_name"] = "vcpkg-source" - - m = _manifest(version="1.0") - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail(existing, m, "abseil", "abseil-cpp", "vcpkg", "vcpkg", "ports") - - source_names = [s["source_name"] for s in detail["catalog_sources"]] - assert source_names == ["vcpkg"], f"expected only 'vcpkg', got {source_names}" - - -def test_merge_detail_urls_propagated_from_manifest() -> None: - """urls from the manifest are written into the detail JSON.""" - m = BaseManifest( - entry_name="abseil", - package_name="abseil-cpp", - description="desc", - homepage="https://github.com/abseil/abseil-cpp", - license=None, - version=None, - urls={ - "Homepage": "https://github.com/abseil/abseil-cpp", - "Source": "https://github.com/x/y", - }, - ) - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail(None, m, "abseil", "abseil-cpp", "vcpkg", "vcpkg", "ports") - assert detail["urls"]["Homepage"] == "https://github.com/abseil/abseil-cpp" - assert detail["urls"]["Source"] == "https://github.com/x/y" - - -def test_merge_detail_urls_empty_when_manifest_has_no_urls() -> None: - """urls field in the detail JSON is an empty dict when the manifest carries none.""" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail(None, _manifest(), "abseil", "abseil-cpp", "vcpkg", "vcpkg", "ports") - assert detail["urls"] == {} - - -def test_merge_detail_urls_merged_across_sources() -> None: - """URLs from a second source are merged into the existing urls dict.""" - existing = _existing_detail() - existing["urls"] = {"Homepage": "https://github.com/abseil/abseil-cpp"} - m = BaseManifest( - entry_name="abseil", - package_name="abseil-cpp", - description="desc", - 
homepage="https://github.com/abseil/abseil-cpp", - license=None, - version=None, - urls={"Source": "https://github.com/conan-io/conan-center-index"}, - ) - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - detail = _merge_detail(existing, m, "abseil", "abseil-cpp", "conan", "conan", "recipes") - assert "Homepage" in detail["urls"] - assert "Source" in detail["urls"] - - -# --------------------------------------------------------------------------- -# write_catalog -# --------------------------------------------------------------------------- - - -def test_write_catalog_writes_catalog_json(tmp_path: Path) -> None: - """A catalog.json file is created in data_dir.""" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - write_catalog( - [_manifest()], - tmp_path, - source_name="vcpkg", - label="vcpkg", - registry_path="ports", - ) - assert (tmp_path / "catalog.json").exists() - - -def test_write_catalog_entry_in_catalog_json(tmp_path: Path) -> None: - """GitHub entries appear under github/org/repo keys in catalog.json.""" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - write_catalog( - [_manifest()], - tmp_path, - source_name="vcpkg", - label="vcpkg", - registry_path="ports", - ) - catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) - assert "github/abseil/abseil-cpp" in catalog - - -def test_write_catalog_writes_detail_json(tmp_path: Path) -> None: - """Detail JSON is written to data/github/org/repo.json for GitHub packages.""" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - write_catalog( - [_manifest()], - tmp_path, - source_name="vcpkg", - label="vcpkg", - registry_path="ports", - ) - detail_path = tmp_path / "github" / "abseil" / "abseil-cpp.json" - assert detail_path.exists() - detail = json.loads(detail_path.read_text(encoding="utf-8")) - assert detail["org"] == "abseil" - assert detail["repo"] == 
"abseil-cpp" - - -def test_write_catalog_returns_added_count(tmp_path: Path) -> None: - """Two distinct packages each increment the added counter.""" - boost = _manifest( - entry_name="boost", - package_name="boost", - homepage="https://github.com/boostorg/boost", - description="Boost C++ libraries", - ) - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - added, updated = write_catalog( - [_manifest(), boost], - tmp_path, - source_name="vcpkg", - label="vcpkg", - registry_path="ports", - ) - assert added == 2 - assert updated == 0 - - -def test_write_catalog_returns_updated_count(tmp_path: Path) -> None: - """Processing the same package twice increments the updated counter.""" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - write_catalog( - [_manifest()], - tmp_path, - source_name="vcpkg", - label="vcpkg", - registry_path="ports", - ) - added, updated = write_catalog( - [_manifest()], - tmp_path, - source_name="vcpkg", - label="vcpkg", - registry_path="ports", - ) - assert added == 0 - assert updated == 1 - - -def test_write_catalog_skips_manifest_without_homepage(tmp_path: Path) -> None: - """Manifests with no homepage at all are silently skipped.""" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - added, updated = write_catalog( - [_manifest(homepage=None)], - tmp_path, - source_name="vcpkg", - label="vcpkg", - registry_path="ports", - ) - catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) - assert len(catalog) == 0 - assert added == 0 - assert updated == 0 - - -def test_write_catalog_skips_unrecognized_url(tmp_path: Path) -> None: - """Manifests whose homepage cannot be parsed as host/owner/repo are skipped.""" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - added, updated = write_catalog( - [_manifest(homepage="https://example.com/not-a-repo")], - tmp_path, - source_name="vcpkg", - label="vcpkg", - 
registry_path="ports", - ) - catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) - assert len(catalog) == 0 - assert added == 0 - assert updated == 0 - - -def test_write_catalog_accepts_gitlab_homepage(tmp_path: Path) -> None: - """GitLab-hosted packages are written under the gitlab/ directory.""" - gitlab_manifest = _manifest( - entry_name="mylib", - package_name="mylib", - homepage="https://gitlab.com/myorg/mylib", - description="A library on GitLab", - ) - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - added, updated = write_catalog( - [gitlab_manifest], - tmp_path, - source_name="some-source", - label="some-source", - registry_path="packages", - ) - catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) - assert "gitlab/myorg/mylib" in catalog - assert added == 1 - assert updated == 0 - detail_path = tmp_path / "gitlab" / "myorg" / "mylib.json" - assert detail_path.exists() - - -def test_write_catalog_merges_across_two_sources(tmp_path: Path) -> None: - """Same package from two separate sources should be merged into one entry.""" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - write_catalog( - [_manifest()], - tmp_path, - source_name="vcpkg", - label="vcpkg", - registry_path="ports", - ) - write_catalog( - [_manifest()], - tmp_path, - source_name="conan", - label="conan", - registry_path="recipes", - ) - catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) - entry = catalog["github/abseil/abseil-cpp"] - assert "vcpkg" in entry["source_labels"] - assert "conan" in entry["source_labels"] - - -def test_write_catalog_detail_json_has_both_sources(tmp_path: Path) -> None: - """Detail JSON lists both sources after two write_catalog calls.""" - with patch("dfetch_hub.catalog.writer._fetch_upstream_tags", return_value=[]): - write_catalog( - [_manifest()], - tmp_path, - source_name="vcpkg", - label="vcpkg", - 
registry_path="ports", - ) - write_catalog( - [_manifest()], - tmp_path, - source_name="conan", - label="conan", - registry_path="recipes", - ) - detail = json.loads((tmp_path / "github" / "abseil" / "abseil-cpp.json").read_text(encoding="utf-8")) - source_names = [s["source_name"] for s in detail["catalog_sources"]] - assert "vcpkg" in source_names - assert "conan" in source_names +"""Tests for dfetch_hub.catalog.writer: Catalog and CatalogWriter.""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING +from unittest.mock import patch + +import pytest + +if TYPE_CHECKING: + from pathlib import Path + + from dfetch_hub.catalog.sources import BaseManifest + +from dfetch_hub.catalog.entry import CatalogEntry +from dfetch_hub.catalog.writer import Catalog, CatalogWriter + + +@pytest.fixture(autouse=True) +def mock_fetch_tags(): + """Mock fetch_upstream_tags to prevent network access in all tests.""" + with patch("dfetch_hub.catalog.detail.CatalogDetail.fetch_upstream_tags", return_value=[]): + yield + + +def _manifest( + entry_name: str = "abseil", + package_name: str = "abseil-cpp", + description: str = "Abseil C++ libraries from Google", + homepage: str | None = "https://github.com/abseil/abseil-cpp", + license_: str | None = "Apache-2.0", + version: str | None = "20240116.2", + subpath: str | None = None, +) -> "BaseManifest": + from dfetch_hub.catalog.sources import BaseManifest + + return BaseManifest( + entry_name=entry_name, + package_name=package_name, + description=description, + homepage=homepage, + license=license_, + version=version, + subpath=subpath, + ) + + +def _monorepo_manifest(name: str) -> "BaseManifest": + """Build a manifest for a sub-component of a monorepo.""" + return _manifest( + entry_name=name, + package_name=name, + homepage="https://github.com/myorg/mymonorepo", + subpath=name, + ) + + +# --------------------------------------------------------------------------- +# Catalog +# 
--------------------------------------------------------------------------- + + +def test_catalog_load_returns_empty_when_missing(tmp_path: Path) -> None: + """load returns empty catalog when file doesn't exist.""" + catalog = Catalog.load(tmp_path / "catalog.json") + assert catalog.entries == {} + + +def test_catalog_dump_and_load_roundtrip(tmp_path: Path) -> None: + """dump + load preserves data.""" + original = Catalog() + original.entries["test"] = CatalogEntry(cat_id="test", name="test") + path = tmp_path / "catalog.json" + original.dump(path) + restored = Catalog.load(path) + assert "test" in restored.entries + + +def test_catalog_get_or_create_creates_new() -> None: + """get_or_create_entry creates new entry.""" + catalog = Catalog() + _, is_new = catalog.get_or_create_entry( + _manifest(), + "github", + "org", + "repo", + "label", + ) + assert is_new + assert "github/org/repo" in catalog.entries + + +def test_catalog_get_or_create_returns_existing() -> None: + """get_or_create_entry returns existing entry.""" + catalog = Catalog() + catalog.entries["github/org/repo"] = CatalogEntry(cat_id="github/org/repo", name="test") + _, is_new = catalog.get_or_create_entry( + _manifest(), + "github", + "org", + "repo", + "label", + ) + assert not is_new + + +# --------------------------------------------------------------------------- +# CatalogWriter +# --------------------------------------------------------------------------- + + +def test_catalog_writer_write_creates_files(tmp_path: Path) -> None: + """write creates catalog.json and detail JSON.""" + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + writer.write([_manifest()]) + assert (tmp_path / "catalog.json").exists() + assert (tmp_path / "github" / "abseil" / "abseil-cpp.json").exists() + + +def test_catalog_writer_write_returns_counts(tmp_path: Path) -> None: + """write returns added/updated counts.""" + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + added, updated = 
writer.write([_manifest()]) + assert added == 1 + assert updated == 0 + + +def test_catalog_writer_write_skips_without_homepage(tmp_path: Path) -> None: + """Manifests with no homepage are skipped.""" + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + added, updated = writer.write([_manifest(homepage=None)]) + assert added == 0 + assert updated == 0 + catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) + assert len(catalog) == 0 + + +def test_catalog_writer_write_skips_unrecognized_url(tmp_path: Path) -> None: + """Manifests with unparsable URLs are skipped.""" + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + added, updated = writer.write([_manifest(homepage="https://example.com/not-a-repo")]) + assert added == 0 + assert updated == 0 + + +def test_write_catalog_writes_catalog_json(tmp_path: Path) -> None: + """A catalog.json file is created in data_dir.""" + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + writer.write([_manifest()]) + assert (tmp_path / "catalog.json").exists() + + +def test_write_catalog_entry_in_catalog_json(tmp_path: Path) -> None: + """GitHub entries appear under github/org/repo keys in catalog.json.""" + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + writer.write([_manifest()]) + catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) + assert "github/abseil/abseil-cpp" in catalog + + +def test_write_catalog_writes_detail_json(tmp_path: Path) -> None: + """Detail JSON is written to data/github/org/repo.json for GitHub packages.""" + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + writer.write([_manifest()]) + detail_path = tmp_path / "github" / "abseil" / "abseil-cpp.json" + assert detail_path.exists() + detail = json.loads(detail_path.read_text(encoding="utf-8")) + assert detail["org"] == "abseil" + assert detail["repo"] == "abseil-cpp" + + +def test_write_catalog_returns_added_count(tmp_path: Path) -> None: + """Two 
distinct packages each increment the added counter.""" + boost = _manifest( + entry_name="boost", + package_name="boost", + homepage="https://github.com/boostorg/boost", + description="Boost C++ libraries", + ) + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + added, updated = writer.write([_manifest(), boost]) + assert added == 2 + assert updated == 0 + + +def test_write_catalog_returns_updated_count(tmp_path: Path) -> None: + """Processing the same package twice increments the updated counter.""" + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + writer.write([_manifest()]) + added, updated = writer.write([_manifest()]) + assert added == 0 + assert updated == 1 + + +def test_write_catalog_accepts_gitlab_homepage(tmp_path: Path) -> None: + """GitLab-hosted packages are written under the gitlab/ directory.""" + gitlab_manifest = _manifest( + entry_name="mylib", + package_name="mylib", + homepage="https://gitlab.com/myorg/mylib", + description="A library on GitLab", + ) + writer = CatalogWriter(tmp_path, "some-source", "some-source", "packages") + added, updated = writer.write([gitlab_manifest]) + catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) + assert "gitlab/myorg/mylib" in catalog + assert added == 1 + assert updated == 0 + + +def test_write_catalog_merges_across_two_sources(tmp_path: Path) -> None: + """Same package from two separate sources should be merged into one entry.""" + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + writer.write([_manifest()]) + writer = CatalogWriter(tmp_path, "conan", "conan", "recipes") + writer.write([_manifest()]) + catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) + entry = catalog["github/abseil/abseil-cpp"] + assert "vcpkg" in entry["source_labels"] + assert "conan" in entry["source_labels"] + + +def test_write_catalog_detail_json_has_both_sources(tmp_path: Path) -> None: + """Detail JSON lists both sources after two 
write_catalog calls.""" + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + writer.write([_manifest()]) + writer = CatalogWriter(tmp_path, "conan", "conan", "recipes") + writer.write([_manifest()]) + detail = json.loads((tmp_path / "github" / "abseil" / "abseil-cpp.json").read_text(encoding="utf-8")) + source_names = [s["source_name"] for s in detail["catalog_sources"]] + assert "vcpkg" in source_names + assert "conan" in source_names + + +def test_write_catalog_monorepo_components_get_distinct_ids(tmp_path: Path) -> None: + """Two components from the same repo with different subpaths get distinct catalog IDs.""" + foo = _monorepo_manifest("foo") + bar = _monorepo_manifest("bar") + writer = CatalogWriter(tmp_path, "readme", "readme", "packages") + added, updated = writer.write([foo, bar]) + catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) + assert "github/myorg/mymonorepo/foo" in catalog + assert "github/myorg/mymonorepo/bar" in catalog + assert added == 2 + assert updated == 0 + + +def test_write_catalog_monorepo_components_get_distinct_detail_paths(tmp_path: Path) -> None: + """Each monorepo component is written to its own detail JSON file.""" + foo = _monorepo_manifest("foo") + bar = _monorepo_manifest("bar") + writer = CatalogWriter(tmp_path, "readme", "readme", "packages") + writer.write([foo, bar]) + foo_path = tmp_path / "github" / "myorg" / "mymonorepo" / "foo.json" + bar_path = tmp_path / "github" / "myorg" / "mymonorepo" / "bar.json" + assert foo_path.exists() + assert bar_path.exists() + + +def test_write_catalog_monorepo_detail_contains_subfolder_path(tmp_path: Path) -> None: + """The detail JSON for a monorepo component stores its subfolder_path.""" + foo = _monorepo_manifest("foo") + writer = CatalogWriter(tmp_path, "readme", "readme", "packages") + writer.write([foo]) + detail = json.loads((tmp_path / "github" / "myorg" / "mymonorepo" / "foo.json").read_text(encoding="utf-8")) + assert detail["subfolder_path"] 
== "foo" + + +def test_write_catalog_non_monorepo_detail_path_unchanged(tmp_path: Path) -> None: + """Packages without a subpath continue to use the flat .json path.""" + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + writer.write([_manifest()]) + assert (tmp_path / "github" / "abseil" / "abseil-cpp.json").exists() + + +def test_write_catalog_monorepo_catalog_id_in_entry(tmp_path: Path) -> None: + """The id field inside the catalog entry includes the subpath.""" + foo = _monorepo_manifest("foo") + writer = CatalogWriter(tmp_path, "readme", "readme", "packages") + writer.write([foo]) + catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) + assert catalog["github/myorg/mymonorepo/foo"]["id"] == "github/myorg/mymonorepo/foo" + + +def test_write_catalog_nested_subpath(tmp_path: Path) -> None: + """Nested subpath like 'libs/foo' is preserved in catalog ID and detail file.""" + m = _manifest( + entry_name="libs/foo", + package_name="foo", + homepage="https://github.com/myorg/mymonorepo", + subpath="libs/foo", + ) + writer = CatalogWriter(tmp_path, "readme", "readme", "packages") + writer.write([m]) + + catalog = json.loads((tmp_path / "catalog.json").read_text(encoding="utf-8")) + assert "github/myorg/mymonorepo/libs/foo" in catalog + assert catalog["github/myorg/mymonorepo/libs/foo"]["id"] == "github/myorg/mymonorepo/libs/foo" + + detail_path = tmp_path / "github" / "myorg" / "mymonorepo" / "libs" / "foo.json" + assert detail_path.exists() + detail = json.loads(detail_path.read_text(encoding="utf-8")) + assert detail["subfolder_path"] == "libs/foo" + + +# --------------------------------------------------------------------------- +# Root-entry removal guard (regression for cross-source clobber) +# --------------------------------------------------------------------------- + + +def test_write_manifest_removes_stale_root_entry_from_same_source(tmp_path: Path) -> None: + """Root entry created by the same source is replaced by a 
subpath entry.""" + catalog = Catalog() + catalog.entries["github/myorg/mymonorepo"] = CatalogEntry( + cat_id="github/myorg/mymonorepo", + name="mymonorepo", + source_labels=["vcpkg"], + ) + foo = _monorepo_manifest("foo") + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + writer.write_manifest(foo, catalog) + + assert "github/myorg/mymonorepo" not in catalog.entries + assert "github/myorg/mymonorepo/foo" in catalog.entries + + +def test_write_manifest_preserves_root_entry_from_different_source(tmp_path: Path) -> None: + """Root entry created by a different source is not removed when a subpath manifest arrives.""" + catalog = Catalog() + catalog.entries["github/myorg/mymonorepo"] = CatalogEntry( + cat_id="github/myorg/mymonorepo", + name="mymonorepo", + source_labels=["clib"], + ) + foo = _monorepo_manifest("foo") + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + writer.write_manifest(foo, catalog) + + assert "github/myorg/mymonorepo" in catalog.entries + assert "github/myorg/mymonorepo/foo" in catalog.entries + + +def test_write_manifest_no_root_entry_is_a_noop(tmp_path: Path) -> None: + """Processing a subpath manifest when no root entry exists does not raise.""" + catalog = Catalog() + foo = _monorepo_manifest("foo") + writer = CatalogWriter(tmp_path, "vcpkg", "vcpkg", "ports") + was_added, _ = writer.write_manifest(foo, catalog) + + assert was_added + assert "github/myorg/mymonorepo/foo" in catalog.entries diff --git a/test/test_commands_update.py b/test/test_commands_update.py new file mode 100644 index 0000000..18679c7 --- /dev/null +++ b/test/test_commands_update.py @@ -0,0 +1,169 @@ +"""Tests for dfetch_hub.commands.update._parse_entry_dirs.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +from dfetch_hub.catalog.sources import BaseManifest +from dfetch_hub.commands.update import _parse_entry_dirs # noqa: PLC2701 + +if TYPE_CHECKING: + from pathlib import Path + + +# 
--------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_FALLBACK_URL = "https://github.com/org/monorepo" + + +def _make_manifest( + subpath: str | None = None, + homepage: str | None = _FALLBACK_URL, + in_project_repo: bool = False, +) -> BaseManifest: + """Return a minimal BaseManifest for use in parse_fn stubs.""" + return BaseManifest( + entry_name="pkg", + package_name="pkg", + description="", + homepage=homepage, + license=None, + version=None, + subpath=subpath, + in_project_repo=in_project_repo, + ) + + +# --------------------------------------------------------------------------- +# _parse_entry_dirs — subpath handling +# --------------------------------------------------------------------------- + + +def test_parse_entry_dirs_assigns_dir_name_as_subpath_when_in_project_repo(tmp_path: Path) -> None: + """entry_dir.name is used as subpath when the parser signals in_project_repo=True.""" + pkg_dir = tmp_path / "zlib" + pkg_dir.mkdir() + + def parse_fn(_p: Path) -> BaseManifest | None: + return _make_manifest(subpath=None, in_project_repo=True) + + manifests, skipped = _parse_entry_dirs([pkg_dir], parse_fn, _FALLBACK_URL) + + assert skipped == 0 + assert len(manifests) == 1 + assert manifests[0].in_project_repo is True + assert manifests[0].subpath == "zlib" + + +def test_parse_entry_dirs_preserves_parser_provided_subpath(tmp_path: Path) -> None: + """Parser-provided subpath is not overwritten even when in_project_repo=True.""" + pkg_dir = tmp_path / "zlib" + pkg_dir.mkdir() + + def parse_fn(_p: Path) -> BaseManifest | None: + return _make_manifest(subpath="deep/zlib", in_project_repo=True) + + manifests, _ = _parse_entry_dirs([pkg_dir], parse_fn, _FALLBACK_URL) + + assert manifests[0].in_project_repo is True + assert manifests[0].subpath == "deep/zlib" + + +def test_parse_entry_dirs_does_not_assign_subpath_when_not_in_project_repo(tmp_path: Path) 
-> None: + """Subpath is not set when the parser signals in_project_repo=False (registry entry). + + A registry entry (e.g. a vcpkg port or a conan recipe) whose manifest does not + live in the project's own repository must not receive a subfolder_path derived + from the containing directory name. + """ + pkg_dir = tmp_path / "7bitdi" + pkg_dir.mkdir() + own_homepage = "https://github.com/7bitcoder/7bitDI" + + def parse_fn(_p: Path) -> BaseManifest | None: + return _make_manifest(subpath=None, homepage=own_homepage, in_project_repo=False) + + manifests, _ = _parse_entry_dirs([pkg_dir], parse_fn, _FALLBACK_URL) + + assert manifests[0].homepage == own_homepage + assert manifests[0].in_project_repo is False + assert manifests[0].subpath is None + + +def test_parse_entry_dirs_does_not_assign_subpath_without_fallback(tmp_path: Path) -> None: + """No subpath is assigned when fallback_homepage is None.""" + pkg_dir = tmp_path / "zlib" + pkg_dir.mkdir() + + def parse_fn(_p: Path) -> BaseManifest | None: + return _make_manifest(subpath=None, homepage="https://github.com/madler/zlib", in_project_repo=False) + + manifests, _ = _parse_entry_dirs([pkg_dir], parse_fn, None) + + assert manifests[0].in_project_repo is False + assert manifests[0].subpath is None + + +# --------------------------------------------------------------------------- +# _parse_entry_dirs — homepage fallback +# --------------------------------------------------------------------------- + + +def test_parse_entry_dirs_fills_homepage_from_fallback_when_none(tmp_path: Path) -> None: + """manifest.homepage is populated from fallback_homepage when originally None.""" + pkg_dir = tmp_path / "zlib" + pkg_dir.mkdir() + + def parse_fn(_p: Path) -> BaseManifest | None: + return _make_manifest(homepage=None, in_project_repo=True) + + manifests, _ = _parse_entry_dirs([pkg_dir], parse_fn, _FALLBACK_URL) + + assert manifests[0].homepage == _FALLBACK_URL + assert manifests[0].in_project_repo is True + + +def 
test_parse_entry_dirs_does_not_overwrite_existing_homepage(tmp_path: Path) -> None: + """manifest.homepage is not replaced when the parser already set one.""" + pkg_dir = tmp_path / "zlib" + pkg_dir.mkdir() + upstream = "https://github.com/madler/zlib" + + def parse_fn(_p: Path) -> BaseManifest | None: + return _make_manifest(homepage=upstream, in_project_repo=False) + + manifests, _ = _parse_entry_dirs([pkg_dir], parse_fn, _FALLBACK_URL) + + assert manifests[0].homepage == upstream + assert manifests[0].in_project_repo is False + + +# --------------------------------------------------------------------------- +# _parse_entry_dirs — skip counting +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("n_good,n_bad", [(0, 3), (2, 1), (3, 0)]) +def test_parse_entry_dirs_counts_skipped_correctly(tmp_path: Path, n_good: int, n_bad: int) -> None: + """Directories whose parse_fn returns None are counted as skipped.""" + dirs = [] + for i in range(n_good): + d = tmp_path / f"good{i}" + d.mkdir() + dirs.append(d) + for i in range(n_bad): + d = tmp_path / f"bad{i}" + d.mkdir() + dirs.append(d) + + def parse_fn(p: Path) -> BaseManifest | None: + return _make_manifest() if "good" in p.name else None + + manifests, skipped = _parse_entry_dirs(dirs, parse_fn, None) + + assert len(manifests) == n_good + assert skipped == n_bad