From 3dd4af63bbd55ced5c6d26f8d699661838a0d6dc Mon Sep 17 00:00:00 2001 From: ziad hany Date: Tue, 14 Oct 2025 17:40:15 +0300 Subject: [PATCH 1/5] Add support for collecting GitHub vulnerability-related issues and pull requests Add tests for this functionality Signed-off-by: ziad hany --- vulnerabilities/importers/__init__.py | 2 + .../pipelines/v2_importers/github_issue_pr.py | 92 +++++++++++++++++++ .../v2_importers/test_github_issue_pr.py | 80 ++++++++++++++++ .../expected_advisory_output.json | 64 +++++++++++++ .../github_issue_pr/issues_and_pr.json | 24 +++++ 5 files changed, 262 insertions(+) create mode 100644 vulnerabilities/pipelines/v2_importers/github_issue_pr.py create mode 100644 vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py create mode 100644 vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json create mode 100644 vulnerabilities/tests/test_data/github_issue_pr/issues_and_pr.json diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 72e4ea4b3..2c7f61463 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -48,6 +48,7 @@ from vulnerabilities.pipelines.v2_importers import ( elixir_security_importer as elixir_security_importer_v2, ) +from vulnerabilities.pipelines.v2_importers import github_issue_pr as github_issue_pr_v2 from vulnerabilities.pipelines.v2_importers import epss_importer_v2 from vulnerabilities.pipelines.v2_importers import fireeye_importer_v2 from vulnerabilities.pipelines.v2_importers import github_osv_importer as github_osv_importer_v2 @@ -135,5 +136,6 @@ ubuntu_usn.UbuntuUSNImporter, fireeye.FireyeImporter, oss_fuzz.OSSFuzzImporter, + github_issue_pr_v2.GithubPipelineIssuePR, ] ) diff --git a/vulnerabilities/pipelines/v2_importers/github_issue_pr.py b/vulnerabilities/pipelines/v2_importers/github_issue_pr.py new file mode 100644 index 000000000..ec33e925e --- /dev/null +++ 
b/vulnerabilities/pipelines/v2_importers/github_issue_pr.py @@ -0,0 +1,92 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import re +from collections import defaultdict + +from github import Github + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.importer import ReferenceV2 +from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 +from vulnerablecode.settings import env + +GITHUB_TOKEN = env.str("GITHUB_TOKEN") + + +class GithubPipelineIssuePR(VulnerableCodeBaseImporterPipelineV2): + """ + Pipeline to collect GitHub issues and PRs related to vulnerabilities. + """ + + pipeline_id = "collect_issues_pr" + + @classmethod + def steps(cls): + return ( + cls.fetch_entries, + cls.collect_and_store_advisories, + ) + + def fetch_entries(self): + """Clone the repository.""" + self.repo_url = "https://github.com/torvalds/linux" + repo_name = "django/django" + + g = Github(login_or_token=GITHUB_TOKEN) + + base_query = f"repo:{repo_name} (CVE OR PYSEC OR GHSA)" + self.issues = g.search_issues(f"{base_query} is:issue") + self.pull_requestes = g.search_issues(f"{base_query} is:pr") + + def advisories_count(self) -> int: + """ + Return total number of advisories discovered (issues + PRs). + """ + return self.issues.totalCount + self.pull_requestes.totalCount + + def collect_issues_and_prs(self): + """ + Group issues and PRs by vulnerability identifiers (like CVE-xxxx-yyyy). + Returns a dict mapping vuln_id -> [(type, html_url)]. 
+ """ + self.log("Grouping GitHub issues and PRs by vulnerability identifiers.") + + grouped_items = defaultdict(list) + pattern = re.compile(r"(CVE-\d{4}-\d+|PYSEC-\d{4}-\d+|GHSA-[\w-]+)", re.IGNORECASE) + + for issue in self.issues: + matches = pattern.findall(issue.title + " " + (issue.body or "")) + for match in matches: + grouped_items[match].append(("Issue", issue.html_url)) + + for pr in self.pull_requestes: + matches = pattern.findall(pr.title + " " + (pr.body or "")) + for match in matches: + grouped_items[match].append(("PR", pr.html_url)) + + self.log(f"Grouped {len(grouped_items)} unique vulnerability identifiers.") + return grouped_items + + def collect_advisories(self): + """ + Generate AdvisoryData objects for each vulnerability ID grouped with its related GitHub issues and PRs. + """ + self.log("Generating AdvisoryData objects from GitHub issues and PRs.") + grouped_data = self.collect_issues_and_prs() + + for vuln_id, refs in grouped_data.items(): + references = [ReferenceV2(reference_id=ref_id, url=url) for ref_id, url in refs] + + yield AdvisoryData( + advisory_id=vuln_id, + aliases=[vuln_id], + references_v2=references, + url=self.repo_url, + ) diff --git a/vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py b/vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py new file mode 100644 index 000000000..e2b80f00f --- /dev/null +++ b/vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py @@ -0,0 +1,80 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import json +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + +from vulnerabilities.pipelines.v2_importers.github_issue_pr import GithubPipelineIssuePR +from vulnerabilities.tests import util_tests + + +@pytest.fixture +def pipeline(): + pipeline = GithubPipelineIssuePR() + pipeline.repo_url = "https://github.com/test/repo" + pipeline.log = MagicMock() + return pipeline + + +@pytest.mark.django_db +def test_collect_issues_and_prs(pipeline): + pipeline.issues = [ + SimpleNamespace( + title="Fix for CVE-2023-1234 found", + body="This resolves a security issue", + html_url="http://example.com/issue1", + ), + SimpleNamespace( + title="No vulnerability mentioned", + body="This is unrelated", + html_url="http://example.com/issue2", + ), + ] + + pipeline.pull_requestes = [ + SimpleNamespace( + title="Patch addressing GHSA-zzz-111", + body="Also fixes PYSEC-2024-5678", + html_url="http://example.com/pr1", + ) + ] + + result = pipeline.collect_issues_and_prs() + expected = { + "CVE-2023-1234": [("Issue", "http://example.com/issue1")], + "GHSA-zzz-111": [("PR", "http://example.com/pr1")], + "PYSEC-2024-5678": [("PR", "http://example.com/pr1")], + } + + assert result == expected + + +TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "github_issue_pr" + + +@pytest.mark.django_db +def test_collect_advisories_from_json(): + input_file = TEST_DATA / "issues_and_pr.json" + expected_file = TEST_DATA / "expected_advisory_output.json" + + issues_and_prs = json.loads(input_file.read_text(encoding="utf-8")) + + pipeline = GithubPipelineIssuePR() + pipeline.repo_url = "https://github.com/test/repo" + pipeline.log = MagicMock() + + pipeline.collect_issues_and_prs = MagicMock(return_value=issues_and_prs) + + result = [adv.to_dict() for adv in pipeline.collect_advisories()] + + util_tests.check_results_against_json(result, expected_file) diff --git 
a/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json b/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json new file mode 100644 index 000000000..3ac486d96 --- /dev/null +++ b/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json @@ -0,0 +1,64 @@ +[ + { + "advisory_id": "CVE-2023-1234", + "aliases": [ + "CVE-2023-1234" + ], + "summary": "", + "affected_packages": [], + "references_v2": [ + { + "reference_id": "Issue", + "reference_type": "", + "url": "https://example.com/issue1" + }, + { + "reference_id": "PR", + "reference_type": "", + "url": "https://example.com/pr1" + } + ], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + }, + { + "advisory_id": "GHSA-zzz-111", + "aliases": [ + "GHSA-zzz-111" + ], + "summary": "", + "affected_packages": [], + "references_v2": [ + { + "reference_id": "PR", + "reference_type": "", + "url": "https://example.com/pr1" + } + ], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + }, + { + "advisory_id": "PYSEC-2024-5678", + "aliases": [ + "PYSEC-2024-5678" + ], + "summary": "", + "affected_packages": [], + "references_v2": [ + { + "reference_id": "PR", + "reference_type": "", + "url": "https://example.com/pr1" + } + ], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + } +] \ No newline at end of file diff --git a/vulnerabilities/tests/test_data/github_issue_pr/issues_and_pr.json b/vulnerabilities/tests/test_data/github_issue_pr/issues_and_pr.json new file mode 100644 index 000000000..2f68eab98 --- /dev/null +++ b/vulnerabilities/tests/test_data/github_issue_pr/issues_and_pr.json @@ -0,0 +1,24 @@ +{ + "CVE-2023-1234": [ + [ + "Issue", + "https://example.com/issue1" + ], + [ + "PR", + "https://example.com/pr1" + ] + ], + "GHSA-zzz-111": [ + [ + "PR", + "https://example.com/pr1" + ] + ], + 
"PYSEC-2024-5678": [ + [ + "PR", + "https://example.com/pr1" + ] + ] +} \ No newline at end of file From 3b39e3c10d72264a280398efe500e84fb200624d Mon Sep 17 00:00:00 2001 From: ziad hany Date: Sat, 24 Jan 2026 00:25:56 +0200 Subject: [PATCH 2/5] Resolve migration conflicts Rename the pipeline name Add the missing pygithub dependency Signed-off-by: ziad hany --- requirements.txt | 1 + vulnerabilities/importers/__init__.py | 4 ++-- .../pipelines/v2_importers/github_issue_pr.py | 6 ++--- .../v2_importers/test_github_issue_pr.py | 23 +++++++++---------- .../expected_advisory_output.json | 15 +++++------- 5 files changed, 23 insertions(+), 26 deletions(-) diff --git a/requirements.txt b/requirements.txt index dbda64e0c..dd5f63be8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -126,3 +126,4 @@ wcwidth==0.2.5 websocket-client==0.59.0 yarl==1.7.2 zipp==3.19.1 +PyGithub==2.8.1 diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 2c7f61463..d9b95bc88 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -48,9 +48,9 @@ from vulnerabilities.pipelines.v2_importers import ( elixir_security_importer as elixir_security_importer_v2, ) -from vulnerabilities.pipelines.v2_importers import github_issue_pr as github_issue_pr_v2 from vulnerabilities.pipelines.v2_importers import epss_importer_v2 from vulnerabilities.pipelines.v2_importers import fireeye_importer_v2 +from vulnerabilities.pipelines.v2_importers import github_issue_pr as github_issue_pr_v2 from vulnerabilities.pipelines.v2_importers import github_osv_importer as github_osv_importer_v2 from vulnerabilities.pipelines.v2_importers import gitlab_importer as gitlab_importer_v2 from vulnerabilities.pipelines.v2_importers import istio_importer as istio_importer_v2 @@ -101,6 +101,7 @@ epss_importer_v2.EPSSImporterPipeline, nginx_importer_v2.NginxImporterPipeline, mattermost_importer_v2.MattermostImporterPipeline, + 
github_issue_pr_v2.GithubPipelineIssuePRPipeline, nvd_importer.NVDImporterPipeline, github_importer.GitHubAPIImporterPipeline, gitlab_importer.GitLabImporterPipeline, @@ -136,6 +137,5 @@ ubuntu_usn.UbuntuUSNImporter, fireeye.FireyeImporter, oss_fuzz.OSSFuzzImporter, - github_issue_pr_v2.GithubPipelineIssuePR, ] ) diff --git a/vulnerabilities/pipelines/v2_importers/github_issue_pr.py b/vulnerabilities/pipelines/v2_importers/github_issue_pr.py index ec33e925e..d5de59740 100644 --- a/vulnerabilities/pipelines/v2_importers/github_issue_pr.py +++ b/vulnerabilities/pipelines/v2_importers/github_issue_pr.py @@ -20,12 +20,12 @@ GITHUB_TOKEN = env.str("GITHUB_TOKEN") -class GithubPipelineIssuePR(VulnerableCodeBaseImporterPipelineV2): +class GithubPipelineIssuePRPipeline(VulnerableCodeBaseImporterPipelineV2): """ Pipeline to collect GitHub issues and PRs related to vulnerabilities. """ - pipeline_id = "collect_issues_pr" + pipeline_id = "collect_github_issues_pr" @classmethod def steps(cls): @@ -86,7 +86,7 @@ def collect_advisories(self): yield AdvisoryData( advisory_id=vuln_id, - aliases=[vuln_id], + aliases=[], references_v2=references, url=self.repo_url, ) diff --git a/vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py b/vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py index e2b80f00f..a3dc57d87 100644 --- a/vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py +++ b/vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py @@ -14,13 +14,15 @@ import pytest -from vulnerabilities.pipelines.v2_importers.github_issue_pr import GithubPipelineIssuePR +from vulnerabilities.pipelines.v2_importers.github_issue_pr import GithubPipelineIssuePRPipeline from vulnerabilities.tests import util_tests +TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "github_issue_pr" + @pytest.fixture def pipeline(): - pipeline = GithubPipelineIssuePR() + pipeline = GithubPipelineIssuePRPipeline() pipeline.repo_url = 
"https://github.com/test/repo" pipeline.log = MagicMock() return pipeline @@ -32,12 +34,12 @@ def test_collect_issues_and_prs(pipeline): SimpleNamespace( title="Fix for CVE-2023-1234 found", body="This resolves a security issue", - html_url="http://example.com/issue1", + html_url="https://example.com/issue1", ), SimpleNamespace( title="No vulnerability mentioned", body="This is unrelated", - html_url="http://example.com/issue2", + html_url="https://example.com/issue2", ), ] @@ -45,23 +47,20 @@ def test_collect_issues_and_prs(pipeline): SimpleNamespace( title="Patch addressing GHSA-zzz-111", body="Also fixes PYSEC-2024-5678", - html_url="http://example.com/pr1", + html_url="https://example.com/pr1", ) ] result = pipeline.collect_issues_and_prs() expected = { - "CVE-2023-1234": [("Issue", "http://example.com/issue1")], - "GHSA-zzz-111": [("PR", "http://example.com/pr1")], - "PYSEC-2024-5678": [("PR", "http://example.com/pr1")], + "CVE-2023-1234": [("Issue", "https://example.com/issue1")], + "GHSA-zzz-111": [("PR", "https://example.com/pr1")], + "PYSEC-2024-5678": [("PR", "https://example.com/pr1")], } assert result == expected -TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "github_issue_pr" - - @pytest.mark.django_db def test_collect_advisories_from_json(): input_file = TEST_DATA / "issues_and_pr.json" @@ -69,7 +68,7 @@ def test_collect_advisories_from_json(): issues_and_prs = json.loads(input_file.read_text(encoding="utf-8")) - pipeline = GithubPipelineIssuePR() + pipeline = GithubPipelineIssuePRPipeline() pipeline.repo_url = "https://github.com/test/repo" pipeline.log = MagicMock() diff --git a/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json b/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json index 3ac486d96..525261280 100644 --- a/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json +++ b/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json 
@@ -1,9 +1,7 @@ [ { "advisory_id": "CVE-2023-1234", - "aliases": [ - "CVE-2023-1234" - ], + "aliases": [], "summary": "", "affected_packages": [], "references_v2": [ @@ -18,6 +16,7 @@ "url": "https://example.com/pr1" } ], + "patches": [], "severities": [], "date_published": null, "weaknesses": [], @@ -25,9 +24,7 @@ }, { "advisory_id": "GHSA-zzz-111", - "aliases": [ - "GHSA-zzz-111" - ], + "aliases": [], "summary": "", "affected_packages": [], "references_v2": [ @@ -37,6 +34,7 @@ "url": "https://example.com/pr1" } ], + "patches": [], "severities": [], "date_published": null, "weaknesses": [], @@ -44,9 +42,7 @@ }, { "advisory_id": "PYSEC-2024-5678", - "aliases": [ - "PYSEC-2024-5678" - ], + "aliases": [], "summary": "", "affected_packages": [], "references_v2": [ @@ -56,6 +52,7 @@ "url": "https://example.com/pr1" } ], + "patches": [], "severities": [], "date_published": null, "weaknesses": [], From 94cc08bc8a2cdad68a7cca5e7a4f3b5e49e5173a Mon Sep 17 00:00:00 2001 From: ziad hany Date: Sat, 24 Jan 2026 00:32:14 +0200 Subject: [PATCH 3/5] Add the missing pygithub dependency to setup.cfg Signed-off-by: ziad hany --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index c104497ab..fb9433d6d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -90,6 +90,7 @@ install_requires = # networking GitPython>=3.1.17 + PyGithub>=2.8.1 requests>=2.25.1 fetchcode>=0.6.0 From 575fedc93671b9ce47fee17629a24e16eb2de000 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Sat, 24 Jan 2026 00:45:40 +0200 Subject: [PATCH 4/5] Fix CI & resolve dependency conflict Signed-off-by: ziad hany --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index dd5f63be8..e53583c89 100644 --- a/requirements.txt +++ b/requirements.txt @@ -126,4 +126,4 @@ wcwidth==0.2.5 websocket-client==0.59.0 yarl==1.7.2 zipp==3.19.1 -PyGithub==2.8.1 +PyGithub==2.6.1 diff --git a/setup.cfg b/setup.cfg index 
fb9433d6d..7016aa57e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -90,7 +90,7 @@ install_requires = # networking GitPython>=3.1.17 - PyGithub>=2.8.1 + PyGithub>=2.6.1 requests>=2.25.1 fetchcode>=0.6.0 From 6318895a68cf38f8d33a2a807a17cbee290309f2 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Mon, 26 Jan 2026 13:47:45 +0200 Subject: [PATCH 5/5] Refactor the pipeline for collecting GitHub/GitLab issues and PRS Signed-off-by: ziad hany --- requirements.txt | 1 + vulnerabilities/importers/__init__.py | 6 +- vulnerabilities/pipelines/__init__.py | 104 +++++++++++++++ .../v2_importers/collect_issue_pr.py | 20 +++ .../pipelines/v2_importers/github_issue_pr.py | 92 ------------- .../v2_importers/test_collect_issue_pr.py | 126 ++++++++++++++++++ .../v2_importers/test_github_issue_pr.py | 79 ----------- .../collect_issue_pr/expected_github.json | 1 + .../collect_issue_pr/expected_gitlab.json | 1 + .../github_issues_and_pr.json} | 0 .../gitlab_issues_and_pr.json | 1 + .../expected_advisory_output.json | 61 --------- 12 files changed, 258 insertions(+), 234 deletions(-) create mode 100644 vulnerabilities/pipelines/v2_importers/collect_issue_pr.py delete mode 100644 vulnerabilities/pipelines/v2_importers/github_issue_pr.py create mode 100644 vulnerabilities/tests/pipelines/v2_importers/test_collect_issue_pr.py delete mode 100644 vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py create mode 100644 vulnerabilities/tests/test_data/collect_issue_pr/expected_github.json create mode 100644 vulnerabilities/tests/test_data/collect_issue_pr/expected_gitlab.json rename vulnerabilities/tests/test_data/{github_issue_pr/issues_and_pr.json => collect_issue_pr/github_issues_and_pr.json} (100%) create mode 100644 vulnerabilities/tests/test_data/collect_issue_pr/gitlab_issues_and_pr.json delete mode 100644 vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json diff --git a/requirements.txt b/requirements.txt index e53583c89..19936b166 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -127,3 +127,4 @@ websocket-client==0.59.0 yarl==1.7.2 zipp==3.19.1 PyGithub==2.6.1 +python-gitlab~=7.1.0 \ No newline at end of file diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index d9b95bc88..758ab4192 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -44,13 +44,13 @@ from vulnerabilities.pipelines.v2_importers import aosp_importer as aosp_importer_v2 from vulnerabilities.pipelines.v2_importers import apache_httpd_importer as apache_httpd_v2 from vulnerabilities.pipelines.v2_importers import archlinux_importer as archlinux_importer_v2 +from vulnerabilities.pipelines.v2_importers import collect_issue_pr as collect_issue_pr_v2 from vulnerabilities.pipelines.v2_importers import curl_importer as curl_importer_v2 from vulnerabilities.pipelines.v2_importers import ( elixir_security_importer as elixir_security_importer_v2, ) from vulnerabilities.pipelines.v2_importers import epss_importer_v2 from vulnerabilities.pipelines.v2_importers import fireeye_importer_v2 -from vulnerabilities.pipelines.v2_importers import github_issue_pr as github_issue_pr_v2 from vulnerabilities.pipelines.v2_importers import github_osv_importer as github_osv_importer_v2 from vulnerabilities.pipelines.v2_importers import gitlab_importer as gitlab_importer_v2 from vulnerabilities.pipelines.v2_importers import istio_importer as istio_importer_v2 @@ -101,7 +101,6 @@ epss_importer_v2.EPSSImporterPipeline, nginx_importer_v2.NginxImporterPipeline, mattermost_importer_v2.MattermostImporterPipeline, - github_issue_pr_v2.GithubPipelineIssuePRPipeline, nvd_importer.NVDImporterPipeline, github_importer.GitHubAPIImporterPipeline, gitlab_importer.GitLabImporterPipeline, @@ -137,5 +136,7 @@ ubuntu_usn.UbuntuUSNImporter, fireeye.FireyeImporter, oss_fuzz.OSSFuzzImporter, + collect_issue_pr_v2.CollectKubernetesPRSIssues, +
collect_issue_pr_v2.CollectWiresharkPRSIssues, ] ) diff --git a/vulnerabilities/pipelines/__init__.py b/vulnerabilities/pipelines/__init__.py index 9efd58c05..dddb1bb6d 100644 --- a/vulnerabilities/pipelines/__init__.py +++ b/vulnerabilities/pipelines/__init__.py @@ -8,25 +8,33 @@ # import logging +import re import traceback +from abc import abstractmethod +from collections import defaultdict from datetime import datetime from datetime import timezone from timeit import default_timer as timer from traceback import format_exc as traceback_format_exc from typing import Iterable from typing import List +from urllib.parse import urlparse +import gitlab from aboutcode.pipeline import LoopProgress from aboutcode.pipeline import PipelineDefinition from aboutcode.pipeline import humanize_time +from github import Github from vulnerabilities.importer import AdvisoryData +from vulnerabilities.importer import ReferenceV2 from vulnerabilities.improver import MAX_CONFIDENCE from vulnerabilities.models import Advisory from vulnerabilities.models import PipelineRun from vulnerabilities.pipes.advisory import import_advisory from vulnerabilities.pipes.advisory import insert_advisory from vulnerabilities.pipes.advisory import insert_advisory_v2 +from vulnerablecode.settings import env module_logger = logging.getLogger(__name__) @@ -321,3 +329,99 @@ def collect_and_store_advisories(self): continue self.log(f"Successfully collected {collected_advisory_count:,d} advisories") + + +class VCSCollector(VulnerableCodeBaseImporterPipeline): + """ + Pipeline to collect GitHub/GitLab issues and PRs related to vulnerabilities. 
+ """ + + vcs_url: str + CVE_PATTERN = re.compile(r"(CVE-\d{4}-\d+)", re.IGNORECASE) + SUPPORTED_IDENTIFIERS = ["CVE-"] + + collected_items: dict = {} + + def advisories_count(self) -> int: + return 0 + + @classmethod + def steps(cls): + return ( + cls.configure_target, + cls.fetch_entries, + cls.collect_items, + cls.collect_and_store_advisories, + ) + + def configure_target(self): + parsed_url = urlparse(self.repo_url) + parts = parsed_url.path.strip("/").split("/") + if len(parts) < 2: + raise ValueError(f"Invalid URL: {self.repo_url}") + + self.repo_name = f"{parts[0]}/{parts[1]}" + + @abstractmethod + def fetch_entries(self): + raise NotImplementedError + + @abstractmethod + def collect_items(self): + raise NotImplementedError + + def collect_advisories(self): + """ + Generate AdvisoryData objects for each vulnerability ID grouped with its related GitHub/Gitlab issues and PRs. + """ + self.log("Generating AdvisoryData objects from GitHub/Gitlab issues and PRs.") + for vuln_id, refs in self.collected_items.items(): + # Each ref is a (type, url) pair collected for this identifier. + references = [ReferenceV2(reference_type=ref_id, url=url) for ref_id, url in refs] + yield AdvisoryData( + advisory_id=vuln_id, + aliases=[], + references_v2=references, + url=self.repo_url, + ) + + +class GitHubCollector(VCSCollector): + def fetch_entries(self): + """Fetch GitHub Data Entries""" + github_token = env.str("GITHUB_TOKEN") + g = Github(login_or_token=github_token) + base_query = f"repo:{self.repo_name} ({' OR '.join(self.SUPPORTED_IDENTIFIERS)})" + self.issues = g.search_issues(f"{base_query} is:issue") + self.prs = g.search_issues(f"{base_query} is:pr") + + def collect_items(self): + self.collected_items = defaultdict(list) + + for i_type, items in [("Issue", self.issues), ("PR", self.prs)]: + for item in items: + matches = self.CVE_PATTERN.findall(item.title + " " + (item.body or "")) + for match in matches: + self.collected_items[match].append((i_type, item.html_url)) + + +class GitLabCollector(VCSCollector):
+ def fetch_entries(self): + """Fetch GitLab Data Entries""" + gitlab_token = env.str("GITLAB_TOKEN") + gl = gitlab.Gitlab("https://gitlab.com/", private_token=gitlab_token) + project = gl.projects.get(self.repo_name) + base_query = " ".join(self.SUPPORTED_IDENTIFIERS) + self.issues = project.search(scope="issues", search=base_query) + self.prs = project.search(scope="merge_requests", search=base_query) + + def collect_items(self): + self.collected_items = defaultdict(list) + for i_type, items in [("Issue", self.issues), ("PR", self.prs)]: + for item in items: + title = item.get("title") or "" + description = item.get("description") or "" + matches = self.CVE_PATTERN.findall(title + " " + description) + for match in matches: + url = item.get("web_url") + self.collected_items[match].append((i_type, url)) diff --git a/vulnerabilities/pipelines/v2_importers/collect_issue_pr.py b/vulnerabilities/pipelines/v2_importers/collect_issue_pr.py new file mode 100644 index 000000000..85b4bc621 --- /dev/null +++ b/vulnerabilities/pipelines/v2_importers/collect_issue_pr.py @@ -0,0 +1,20 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects.
+# +from vulnerabilities.pipelines import GitHubCollector +from vulnerabilities.pipelines import GitLabCollector + + +class CollectKubernetesPRSIssues(GitHubCollector): + pipeline_id = "collect-kubernetes-prs-issues" + repo_url = "https://github.com/kubernetes/kubernetes" + + +class CollectWiresharkPRSIssues(GitLabCollector): + pipeline_id = "collect-wireshark-prs-issues" + repo_url = "https://gitlab.com/wireshark/wireshark" diff --git a/vulnerabilities/pipelines/v2_importers/github_issue_pr.py b/vulnerabilities/pipelines/v2_importers/github_issue_pr.py deleted file mode 100644 index d5de59740..000000000 --- a/vulnerabilities/pipelines/v2_importers/github_issue_pr.py +++ /dev/null @@ -1,92 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# VulnerableCode is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/vulnerablecode for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import re -from collections import defaultdict - -from github import Github - -from vulnerabilities.importer import AdvisoryData -from vulnerabilities.importer import ReferenceV2 -from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 -from vulnerablecode.settings import env - -GITHUB_TOKEN = env.str("GITHUB_TOKEN") - - -class GithubPipelineIssuePRPipeline(VulnerableCodeBaseImporterPipelineV2): - """ - Pipeline to collect GitHub issues and PRs related to vulnerabilities.
- """ - - pipeline_id = "collect_github_issues_pr" - - @classmethod - def steps(cls): - return ( - cls.fetch_entries, - cls.collect_and_store_advisories, - ) - - def fetch_entries(self): - """Clone the repository.""" - self.repo_url = "https://github.com/torvalds/linux" - repo_name = "django/django" - - g = Github(login_or_token=GITHUB_TOKEN) - - base_query = f"repo:{repo_name} (CVE OR PYSEC OR GHSA)" - self.issues = g.search_issues(f"{base_query} is:issue") - self.pull_requestes = g.search_issues(f"{base_query} is:pr") - - def advisories_count(self) -> int: - """ - Return total number of advisories discovered (issues + PRs). - """ - return self.issues.totalCount + self.pull_requestes.totalCount - - def collect_issues_and_prs(self): - """ - Group issues and PRs by vulnerability identifiers (like CVE-xxxx-yyyy). - Returns a dict mapping vuln_id -> [(type, html_url)]. - """ - self.log("Grouping GitHub issues and PRs by vulnerability identifiers.") - - grouped_items = defaultdict(list) - pattern = re.compile(r"(CVE-\d{4}-\d+|PYSEC-\d{4}-\d+|GHSA-[\w-]+)", re.IGNORECASE) - - for issue in self.issues: - matches = pattern.findall(issue.title + " " + (issue.body or "")) - for match in matches: - grouped_items[match].append(("Issue", issue.html_url)) - - for pr in self.pull_requestes: - matches = pattern.findall(pr.title + " " + (pr.body or "")) - for match in matches: - grouped_items[match].append(("PR", pr.html_url)) - - self.log(f"Grouped {len(grouped_items)} unique vulnerability identifiers.") - return grouped_items - - def collect_advisories(self): - """ - Generate AdvisoryData objects for each vulnerability ID grouped with its related GitHub issues and PRs.
- """ - self.log("Generating AdvisoryData objects from GitHub issues and PRs.") - grouped_data = self.collect_issues_and_prs() - - for vuln_id, refs in grouped_data.items(): - references = [ReferenceV2(reference_id=ref_id, url=url) for ref_id, url in refs] - - yield AdvisoryData( - advisory_id=vuln_id, - aliases=[], - references_v2=references, - url=self.repo_url, - ) diff --git a/vulnerabilities/tests/pipelines/v2_importers/test_collect_issue_pr.py b/vulnerabilities/tests/pipelines/v2_importers/test_collect_issue_pr.py new file mode 100644 index 000000000..41e8477ba --- /dev/null +++ b/vulnerabilities/tests/pipelines/v2_importers/test_collect_issue_pr.py @@ -0,0 +1,126 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects.
+# + +import json +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + +from vulnerabilities.pipelines import GitHubCollector +from vulnerabilities.pipelines import GitLabCollector +from vulnerabilities.tests import util_tests + +TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "collect_issue_pr" + + +@pytest.mark.django_db +def test_collect_github_issues_and_prs(): + pipeline = GitHubCollector() + pipeline.issues = [ + SimpleNamespace( + title="Fix the CVE-2023-1234 found", + body="This resolves a security issue", + html_url="https://github.com/issue1", + ), + SimpleNamespace( + title="vulnerability 1", + body="Fix CVE-2023-124", + html_url="https://github.com/issue2", + ), + SimpleNamespace( + title="vulnerability 2", + body="vulnerability 2", + html_url="https://github.com/issue3", + ), + ] + + pipeline.prs = [ + SimpleNamespace( + title="Patch addressing CVE-2023-1234", + body="Also fixes CVE-2023-1234", + html_url="https://github.com/pr1", + ) + ] + + pipeline.collect_items() + expected = { + "CVE-2023-1234": [ + ("Issue", "https://github.com/issue1"), + ("PR", "https://github.com/pr1"), + ("PR", "https://github.com/pr1"), + ], + "CVE-2023-124": [("Issue", "https://github.com/issue2")], + } + + assert pipeline.collected_items == expected + + +@pytest.mark.django_db +def test_collect_gitlab_issues_and_prs(): + pipeline = GitLabCollector() + pipeline.issues = [ + { + "title": "vulnerability CVE-2024-1234", + "description": "vulnerability 1", + "web_url": "https://github.com/issue1", + }, + ] + + pipeline.prs = [ + { + "title": "Patch addressing", + "description": "Also fixes CVE-2023-1234", + "web_url": "https://github.com/pr1", + } + ] + + pipeline.collect_items() + expected = { + "CVE-2024-1234": [("Issue", "https://github.com/issue1")], + "CVE-2023-1234": [("PR", "https://github.com/pr1")], + } + + assert pipeline.collected_items == expected + + +@pytest.mark.parametrize( +
"input_file, expected_file, repo_url, pipeline_class", + [ + ( + "github_issues_and_pr.json", + "expected_github.json", + "https://github.com/test/repo", + GitHubCollector, + ), + ( + "gitlab_issues_and_pr.json", + "expected_gitlab.json", + "https://gitlab.com/test/repo", + GitLabCollector, + ), + ], +) +@pytest.mark.django_db +def test_collect_advisories_from_json(input_file, expected_file, repo_url, pipeline_class): + input_file = TEST_DATA / input_file + expected_file = TEST_DATA / expected_file + + issues_and_prs = json.loads(input_file.read_text(encoding="utf-8")) + + pipeline = pipeline_class() + pipeline.pipeline_id = "collect-prs-issues" + pipeline.repo_url = repo_url + pipeline.log = MagicMock() + + pipeline.collect_items = MagicMock(return_value=issues_and_prs) + + result = [adv.to_dict() for adv in pipeline.collect_advisories()] + util_tests.check_results_against_json(result, expected_file) diff --git a/vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py b/vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py deleted file mode 100644 index a3dc57d87..000000000 --- a/vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py +++ /dev/null @@ -1,79 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# VulnerableCode is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/vulnerablecode for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -import json -from pathlib import Path -from types import SimpleNamespace -from unittest.mock import MagicMock - -import pytest - -from vulnerabilities.pipelines.v2_importers.github_issue_pr import GithubPipelineIssuePRPipeline -from vulnerabilities.tests import util_tests - -TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "github_issue_pr" - - -@pytest.fixture -def pipeline(): - pipeline = GithubPipelineIssuePRPipeline() - pipeline.repo_url = "https://github.com/test/repo" - pipeline.log = MagicMock() - return pipeline - - -@pytest.mark.django_db -def test_collect_issues_and_prs(pipeline): - pipeline.issues = [ - SimpleNamespace( - title="Fix for CVE-2023-1234 found", - body="This resolves a security issue", - html_url="https://example.com/issue1", - ), - SimpleNamespace( - title="No vulnerability mentioned", - body="This is unrelated", - html_url="https://example.com/issue2", - ), - ] - - pipeline.pull_requestes = [ - SimpleNamespace( - title="Patch addressing GHSA-zzz-111", - body="Also fixes PYSEC-2024-5678", - html_url="https://example.com/pr1", - ) - ] - - result = pipeline.collect_issues_and_prs() - expected = { - "CVE-2023-1234": [("Issue", "https://example.com/issue1")], - "GHSA-zzz-111": [("PR", "https://example.com/pr1")], - "PYSEC-2024-5678": [("PR", "https://example.com/pr1")], - } - - assert result == expected - - -@pytest.mark.django_db -def test_collect_advisories_from_json(): - input_file = TEST_DATA / "issues_and_pr.json" - expected_file = TEST_DATA / "expected_advisory_output.json" - - issues_and_prs = json.loads(input_file.read_text(encoding="utf-8")) - - pipeline = GithubPipelineIssuePRPipeline() - pipeline.repo_url = "https://github.com/test/repo" - pipeline.log = MagicMock() - - pipeline.collect_issues_and_prs = MagicMock(return_value=issues_and_prs) - - result = [adv.to_dict() for adv in pipeline.collect_advisories()] - - util_tests.check_results_against_json(result, expected_file) diff --git 
a/vulnerabilities/tests/test_data/collect_issue_pr/expected_github.json b/vulnerabilities/tests/test_data/collect_issue_pr/expected_github.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/vulnerabilities/tests/test_data/collect_issue_pr/expected_github.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/vulnerabilities/tests/test_data/collect_issue_pr/expected_gitlab.json b/vulnerabilities/tests/test_data/collect_issue_pr/expected_gitlab.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/vulnerabilities/tests/test_data/collect_issue_pr/expected_gitlab.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/vulnerabilities/tests/test_data/github_issue_pr/issues_and_pr.json b/vulnerabilities/tests/test_data/collect_issue_pr/github_issues_and_pr.json similarity index 100% rename from vulnerabilities/tests/test_data/github_issue_pr/issues_and_pr.json rename to vulnerabilities/tests/test_data/collect_issue_pr/github_issues_and_pr.json diff --git a/vulnerabilities/tests/test_data/collect_issue_pr/gitlab_issues_and_pr.json b/vulnerabilities/tests/test_data/collect_issue_pr/gitlab_issues_and_pr.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/vulnerabilities/tests/test_data/collect_issue_pr/gitlab_issues_and_pr.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json b/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json deleted file mode 100644 index 525261280..000000000 --- a/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json +++ /dev/null @@ -1,61 +0,0 @@ -[ - { - "advisory_id": "CVE-2023-1234", - "aliases": [], - "summary": "", - "affected_packages": [], - "references_v2": [ - { - "reference_id": "Issue", - "reference_type": "", - "url": "https://example.com/issue1" - }, - { - "reference_id": "PR", - "reference_type": "", - "url": 
"https://example.com/pr1" - } - ], - "patches": [], - "severities": [], - "date_published": null, - "weaknesses": [], - "url": "https://github.com/test/repo" - }, - { - "advisory_id": "GHSA-zzz-111", - "aliases": [], - "summary": "", - "affected_packages": [], - "references_v2": [ - { - "reference_id": "PR", - "reference_type": "", - "url": "https://example.com/pr1" - } - ], - "patches": [], - "severities": [], - "date_published": null, - "weaknesses": [], - "url": "https://github.com/test/repo" - }, - { - "advisory_id": "PYSEC-2024-5678", - "aliases": [], - "summary": "", - "affected_packages": [], - "references_v2": [ - { - "reference_id": "PR", - "reference_type": "", - "url": "https://example.com/pr1" - } - ], - "patches": [], - "severities": [], - "date_published": null, - "weaknesses": [], - "url": "https://github.com/test/repo" - } -] \ No newline at end of file