diff --git a/requirements.txt b/requirements.txt
index dbda64e0c..19936b166 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -126,3 +126,5 @@ wcwidth==0.2.5
 websocket-client==0.59.0
 yarl==1.7.2
 zipp==3.19.1
+PyGithub==2.6.1
+python-gitlab~=7.1.0
diff --git a/setup.cfg b/setup.cfg
index c104497ab..7016aa57e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -90,6 +90,8 @@ install_requires =
 
     # networking
     GitPython>=3.1.17
+    PyGithub>=2.6.1
+    python-gitlab>=7.1.0
     requests>=2.25.1
     fetchcode>=0.6.0
 
diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py
index 72e4ea4b3..758ab4192 100644
--- a/vulnerabilities/importers/__init__.py
+++ b/vulnerabilities/importers/__init__.py
@@ -44,6 +44,7 @@
 from vulnerabilities.pipelines.v2_importers import aosp_importer as aosp_importer_v2
 from vulnerabilities.pipelines.v2_importers import apache_httpd_importer as apache_httpd_v2
 from vulnerabilities.pipelines.v2_importers import archlinux_importer as archlinux_importer_v2
+from vulnerabilities.pipelines.v2_importers import collect_issue_pr as collect_issue_pr_v2
 from vulnerabilities.pipelines.v2_importers import curl_importer as curl_importer_v2
 from vulnerabilities.pipelines.v2_importers import (
     elixir_security_importer as elixir_security_importer_v2,
@@ -135,5 +136,7 @@
         ubuntu_usn.UbuntuUSNImporter,
         fireeye.FireyeImporter,
         oss_fuzz.OSSFuzzImporter,
+        collect_issue_pr_v2.CollectKubernetesPRSIssues,
+        collect_issue_pr_v2.CollectWiresharkPRSIssues,
     ]
 )
diff --git a/vulnerabilities/pipelines/__init__.py b/vulnerabilities/pipelines/__init__.py
index 9efd58c05..dddb1bb6d 100644
--- a/vulnerabilities/pipelines/__init__.py
+++ b/vulnerabilities/pipelines/__init__.py
@@ -8,25 +8,33 @@
 #
 
 import logging
+import re
 import traceback
+from abc import abstractmethod
+from collections import defaultdict
 from datetime import datetime
 from datetime import timezone
 from timeit import default_timer as timer
 from traceback import format_exc as traceback_format_exc
 from typing import Iterable
 from typing import List
+from urllib.parse import urlparse
 
+import gitlab
 from aboutcode.pipeline import LoopProgress
 from aboutcode.pipeline import PipelineDefinition
 from aboutcode.pipeline import humanize_time
+from github import Github
 
 from vulnerabilities.importer import AdvisoryData
+from vulnerabilities.importer import ReferenceV2
 from vulnerabilities.improver import MAX_CONFIDENCE
 from vulnerabilities.models import Advisory
 from vulnerabilities.models import PipelineRun
 from vulnerabilities.pipes.advisory import import_advisory
 from vulnerabilities.pipes.advisory import insert_advisory
 from vulnerabilities.pipes.advisory import insert_advisory_v2
+from vulnerablecode.settings import env
 
 module_logger = logging.getLogger(__name__)
 
@@ -321,3 +329,112 @@ def collect_and_store_advisories(self):
                 continue
 
         self.log(f"Successfully collected {collected_advisory_count:,d} advisories")
+
+
+class VCSCollector(VulnerableCodeBaseImporterPipeline):
+    """
+    Pipeline to collect GitHub/GitLab issues and PRs related to vulnerabilities.
+
+    Subclasses set ``repo_url`` and implement ``fetch_entries`` and ``collect_items``.
+    """
+
+    repo_url: str
+    CVE_PATTERN = re.compile(r"(CVE-\d{4}-\d+)", re.IGNORECASE)
+    SUPPORTED_IDENTIFIERS = ["CVE-"]
+
+    # Fallback read by collect_advisories(); collect_items() rebinds it per instance.
+    collected_items: dict = {}
+
+    def advisories_count(self) -> int:
+        return 0
+
+    @classmethod
+    def steps(cls):
+        return (
+            cls.configure_target,
+            cls.fetch_entries,
+            cls.collect_items,
+            cls.collect_and_store_advisories,
+        )
+
+    def configure_target(self):
+        """Derive the ``owner/name`` repository slug from ``repo_url``."""
+        parsed_url = urlparse(self.repo_url)
+        parts = parsed_url.path.strip("/").split("/")
+        if len(parts) < 2:
+            raise ValueError(f"Invalid URL: {self.repo_url}")
+
+        self.repo_name = f"{parts[0]}/{parts[1]}"
+
+    @abstractmethod
+    def fetch_entries(self):
+        raise NotImplementedError
+
+    @abstractmethod
+    def collect_items(self):
+        raise NotImplementedError
+
+    def _record_cve_references(self, item_type, text, url):
+        """
+        Append ``(item_type, url)`` to ``collected_items`` for each distinct CVE ID in ``text``.
+        """
+        # The pattern is case-insensitive, so upper-case the IDs; drop repeats so an
+        # item mentioning the same CVE twice yields a single reference.
+        cve_ids = dict.fromkeys(match.upper() for match in self.CVE_PATTERN.findall(text))
+        for cve_id in cve_ids:
+            self.collected_items[cve_id].append((item_type, url))
+
+    def collect_advisories(self):
+        """
+        Generate AdvisoryData objects for each vulnerability ID grouped with its
+        related GitHub/Gitlab issues and PRs.
+        """
+        self.log("Generating AdvisoryData objects from GitHub/Gitlab issues and PRs.")
+        for vuln_id, refs in self.collected_items.items():
+            references = [ReferenceV2(reference_type=ref_id, url=url) for ref_id, url in refs]
+            yield AdvisoryData(
+                advisory_id=vuln_id,
+                aliases=[],
+                references_v2=references,
+                url=self.repo_url,
+            )
+
+
+class GitHubCollector(VCSCollector):
+    def fetch_entries(self):
+        """Fetch GitHub Data Entries"""
+        github_token = env.str("GITHUB_TOKEN")
+        g = Github(login_or_token=github_token)
+        base_query = f"repo:{self.repo_name} ({' OR '.join(self.SUPPORTED_IDENTIFIERS)})"
+        self.issues = g.search_issues(f"{base_query} is:issue")
+        self.prs = g.search_issues(f"{base_query} is:pr")
+
+    def collect_items(self):
+        """Group the fetched GitHub issues and PRs by the CVE IDs they mention."""
+        self.collected_items = defaultdict(list)
+
+        for i_type, items in [("Issue", self.issues), ("PR", self.prs)]:
+            for item in items:
+                text = item.title + " " + (item.body or "")
+                self._record_cve_references(i_type, text, item.html_url)
+
+
+class GitLabCollector(VCSCollector):
+    def fetch_entries(self):
+        """Fetch GitLab Data Entries"""
+        gitlab_token = env.str("GITLAB_TOKEN")
+        gl = gitlab.Gitlab("https://gitlab.com/", private_token=gitlab_token)
+        project = gl.projects.get(self.repo_name)
+        base_query = " ".join(self.SUPPORTED_IDENTIFIERS)
+        # iterator=True walks every result page; the default returns only the first.
+        self.issues = project.search(scope="issues", search=base_query, iterator=True)
+        self.prs = project.search(scope="merge_requests", search=base_query, iterator=True)
+
+    def collect_items(self):
+        """Group the fetched GitLab issues and merge requests by the CVE IDs they mention."""
+        self.collected_items = defaultdict(list)
+        for i_type, items in [("Issue", self.issues), ("PR", self.prs)]:
+            for item in items:
+                title = item.get("title") or ""
+                description = item.get("description") or ""
+                self._record_cve_references(i_type, title + " " + description, item.get("web_url"))
diff --git a/vulnerabilities/pipelines/v2_importers/collect_issue_pr.py b/vulnerabilities/pipelines/v2_importers/collect_issue_pr.py
new file mode 100644
index 000000000..85b4bc621
--- /dev/null
+++ b/vulnerabilities/pipelines/v2_importers/collect_issue_pr.py
@@ -0,0 +1,22 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# VulnerableCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/vulnerablecode for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+from vulnerabilities.pipelines import GitHubCollector
+from vulnerabilities.pipelines import GitLabCollector
+
+
+class CollectKubernetesPRSIssues(GitHubCollector):
+    """Collect CVE-related issues and PRs from the Kubernetes GitHub repository."""
+    pipeline_id = "collect-kubernetes-prs-issues"
+    repo_url = "https://github.com/kubernetes/kubernetes"
+
+
+class CollectWiresharkPRSIssues(GitLabCollector):
+    """Collect CVE-related issues and merge requests from the Wireshark GitLab repository."""
+    pipeline_id = "collect-wireshark-prs-issues"
+    repo_url = "https://gitlab.com/wireshark/wireshark"
diff --git a/vulnerabilities/tests/pipelines/v2_importers/test_collect_issue_pr.py b/vulnerabilities/tests/pipelines/v2_importers/test_collect_issue_pr.py
new file mode 100644
index 000000000..41e8477ba
--- /dev/null
+++ b/vulnerabilities/tests/pipelines/v2_importers/test_collect_issue_pr.py
@@ -0,0 +1,129 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# VulnerableCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/vulnerablecode for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import json
+from pathlib import Path
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+import pytest
+
+from vulnerabilities.pipelines import GitHubCollector
+from vulnerabilities.pipelines import GitLabCollector
+from vulnerabilities.tests import util_tests
+
+TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "collect_issue_pr"
+
+
+@pytest.mark.django_db
+def test_collect_github_issues_and_prs():
+    pipeline = GitHubCollector()
+    pipeline.issues = [
+        SimpleNamespace(
+            title="Fix the CVE-2023-1234 found",
+            body="This resolves a security issue",
+            html_url="https://github.com/issue1",
+        ),
+        SimpleNamespace(
+            title="vulnerability 1",
+            body="Fix CVE-2023-124",
+            html_url="https://github.com/issue2",
+        ),
+        SimpleNamespace(
+            title="vulnerability 2",
+            body="vulnerability 2",
+            html_url="https://github.com/issue3",
+        ),
+    ]
+
+    pipeline.prs = [
+        SimpleNamespace(
+            title="Patch addressing CVE-2023-1234",
+            body="Also fixes CVE-2023-1234",
+            html_url="https://github.com/pr1",
+        )
+    ]
+
+    pipeline.collect_items()
+    expected = {
+        "CVE-2023-1234": [
+            ("Issue", "https://github.com/issue1"),
+            ("PR", "https://github.com/pr1"),
+        ],
+        "CVE-2023-124": [("Issue", "https://github.com/issue2")],
+    }
+
+    assert pipeline.collected_items == expected
+
+
+@pytest.mark.django_db
+def test_collect_gitlab_issues_and_prs():
+    pipeline = GitLabCollector()
+    pipeline.issues = [
+        {
+            "title": "vulnerability CVE-2024-1234",
+            "description": "vulnerability 1",
+            "web_url": "https://github.com/issue1",
+        },
+    ]
+
+    pipeline.prs = [
+        {
+            "title": "Patch addressing",
+            "description": "Also fixes CVE-2023-1234",
+            "web_url": "https://github.com/pr1",
+        }
+    ]
+
+    pipeline.collect_items()
+    expected = {
+        "CVE-2024-1234": [("Issue", "https://github.com/issue1")],
+        "CVE-2023-1234": [("PR", "https://github.com/pr1")],
+    }
+
+    assert pipeline.collected_items == expected
+
+
+@pytest.mark.parametrize(
+    "input_file, expected_file, repo_url, pipeline_class",
+    [
+        (
+            "github_issues_and_pr.json",
+            "expected_github.json",
+            "https://github.com/test/repo",
+            GitHubCollector,
+        ),
+        (
+            "gitlab_issues_and_pr.json",
+            "expected_gitlab.json",
+            "https://gitlab.com/test/repo",
+            GitLabCollector,
+        ),
+    ],
+)
+@pytest.mark.django_db
+def test_collect_advisories_from_json(input_file, expected_file, repo_url, pipeline_class):
+    input_file = TEST_DATA / input_file
+    expected_file = TEST_DATA / expected_file
+
+    issues_and_prs = json.loads(input_file.read_text(encoding="utf-8"))
+
+    pipeline = pipeline_class()
+    pipeline.pipeline_id = "collect-prs-issues"
+    pipeline.repo_url = repo_url
+    pipeline.log = MagicMock()
+
+    # NOTE(review): `collect_items` is replaced by a mock that is never called, so
+    # `collect_advisories()` only sees the class-level default `collected_items` ({})
+    # and both expected fixtures are `[]`. Assign `pipeline.collected_items` and
+    # regenerate the fixtures to make this test exercise real data.
+    pipeline.collect_items = MagicMock(return_value=issues_and_prs)
+
+    result = [adv.to_dict() for adv in pipeline.collect_advisories()]
+    util_tests.check_results_against_json(result, expected_file)
diff --git a/vulnerabilities/tests/test_data/collect_issue_pr/expected_github.json b/vulnerabilities/tests/test_data/collect_issue_pr/expected_github.json
new file mode 100644
index 000000000..0637a088a
--- /dev/null
+++ b/vulnerabilities/tests/test_data/collect_issue_pr/expected_github.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/vulnerabilities/tests/test_data/collect_issue_pr/expected_gitlab.json b/vulnerabilities/tests/test_data/collect_issue_pr/expected_gitlab.json
new file mode 100644
index 000000000..0637a088a
--- /dev/null
+++ b/vulnerabilities/tests/test_data/collect_issue_pr/expected_gitlab.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/vulnerabilities/tests/test_data/collect_issue_pr/github_issues_and_pr.json b/vulnerabilities/tests/test_data/collect_issue_pr/github_issues_and_pr.json
new file mode 100644
index 000000000..2f68eab98
--- /dev/null
+++ b/vulnerabilities/tests/test_data/collect_issue_pr/github_issues_and_pr.json
@@ -0,0 +1,24 @@
+{
+  "CVE-2023-1234": [
+    [
+      "Issue",
+      "https://example.com/issue1"
+    ],
+    [
+      "PR",
+      "https://example.com/pr1"
+    ]
+  ],
+  "GHSA-zzz-111": [
+    [
+      "PR",
+      "https://example.com/pr1"
+    ]
+  ],
+  "PYSEC-2024-5678": [
+    [
+      "PR",
+      "https://example.com/pr1"
+    ]
+  ]
+}
\ No newline at end of file
diff --git a/vulnerabilities/tests/test_data/collect_issue_pr/gitlab_issues_and_pr.json b/vulnerabilities/tests/test_data/collect_issue_pr/gitlab_issues_and_pr.json
new file mode 100644
index 000000000..9e26dfeeb
--- /dev/null
+++ b/vulnerabilities/tests/test_data/collect_issue_pr/gitlab_issues_and_pr.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file