From 0fe9ffa5a0144d41fcc5ff4e43f17f89f5e27f17 Mon Sep 17 00:00:00 2001
From: shivamshrma09
Date: Fri, 20 Mar 2026 14:37:52 +0530
Subject: [PATCH] feat: add alias-based relating and summary similarity heuristics for advisory grouping

Signed-off-by: shivamshrma09
---
 .../0117_add_alias_and_summary_issue_types.py |  42 ++++
 vulnerabilities/models.py                     |   5 +
 .../v2_improvers/compute_advisory_todo.py     | 187 ++++++++++++++++++
 .../test_compute_advisory_todo_v2.py          | 169 ++++++++++++++++
 4 files changed, 403 insertions(+)
 create mode 100644 vulnerabilities/migrations/0117_add_alias_and_summary_issue_types.py

diff --git a/vulnerabilities/migrations/0117_add_alias_and_summary_issue_types.py b/vulnerabilities/migrations/0117_add_alias_and_summary_issue_types.py
new file mode 100644
index 000000000..ae731432f
--- /dev/null
+++ b/vulnerabilities/migrations/0117_add_alias_and_summary_issue_types.py
@@ -0,0 +1,42 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# VulnerableCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/vulnerablecode for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from django.db import migrations
+from django.db import models
+import vulnerabilities.models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("vulnerabilities", "0116_advisoryv2_advisory_content_hash"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="advisorytodo",
+            name="issue_type",
+            field=models.CharField(
+                choices=vulnerabilities.models.ISSUE_TYPE_CHOICES,
+                db_index=True,
+                help_text="Select the issue that needs to be addressed from the available options.",
+                max_length=50,
+            ),
+        ),
+        migrations.AlterField(
+            model_name="advisorytodov2",
+            name="issue_type",
+            field=models.CharField(
+                choices=vulnerabilities.models.ISSUE_TYPE_CHOICES,
+                db_index=True,
+                help_text="Select the issue that needs to be addressed from the available options.",
+                max_length=50,
+            ),
+        ),
+    ]
diff --git a/vulnerabilities/models.py b/vulnerabilities/models.py
index d1c88f285..65ffde3ec 100644
--- a/vulnerabilities/models.py
+++ b/vulnerabilities/models.py
@@ -2438,6 +2438,11 @@ def create_new_job(self, execute_now=False):
         "Advisories have conflicting affected and fixed-by packages",
     ),
     ("CONFLICTING_SEVERITY_SCORES", "Advisories have conflicting severity scores"),
+    (
+        "POTENTIALLY_RELATED_BY_ALIASES",
+        "Advisories are potentially related by shared aliases",
+    ),
+    ("SIMILAR_SUMMARIES", "Advisories have similar summaries"),
 ]
 
 
diff --git a/vulnerabilities/pipelines/v2_improvers/compute_advisory_todo.py b/vulnerabilities/pipelines/v2_improvers/compute_advisory_todo.py
index 981f10e92..247cc1d78 100644
--- a/vulnerabilities/pipelines/v2_improvers/compute_advisory_todo.py
+++ b/vulnerabilities/pipelines/v2_improvers/compute_advisory_todo.py
@@ -8,7 +8,9 @@
 #
 
 
+import difflib
 import json
+from itertools import combinations
 
 from aboutcode.pipeline import LoopProgress
 from django.utils import timezone
@@ -20,6 +22,11 @@ from vulnerabilities.pipelines import VulnerableCodePipeline
 from vulnerabilities.pipes.advisory import advisories_checksum
 
 
+# Minimum difflib similarity ratio (0.0 to 1.0) at which two advisory summaries
+# are flagged as similar.
+SUMMARY_SIMILARITY_THRESHOLD = 0.8
+
+
 class ComputeToDo(VulnerableCodePipeline):
     """Compute ToDos for Advisory."""
 
@@ -31,6 +38,8 @@ def steps(cls):
         return (
             cls.compute_individual_advisory_todo,
             cls.detect_conflicting_advisories,
+            cls.relate_advisories_by_aliases,
+            cls.detect_similar_summaries,
         )
 
     def compute_individual_advisory_todo(self):
@@ -144,6 +153,112 @@ def detect_conflicting_advisories(self):
             f"Successfully created {new_todos_count} ToDos for conflicting affected and fixed packages"
         )
 
+    def relate_advisories_by_aliases(self):
+        """
+        Create ToDos for advisories from different datasources that share the same alias.
+        """
+        aliases = AdvisoryAlias.objects.prefetch_related("advisories")
+        aliases_count = aliases.count()
+        advisory_relation_to_create = {}
+        todo_to_create = []
+        new_todos_count = 0
+        batch_size = 5000
+
+        self.log(f"Checking alias-based relations across {aliases_count} aliases")
+
+        progress = LoopProgress(
+            total_iterations=aliases_count,
+            logger=self.log,
+            progress_step=1,
+        )
+        for alias in progress.iter(aliases.iterator(chunk_size=2000)):
+            # all() is answered from the prefetch cache; values() would bypass it
+            # and run one more query per alias.
+            advisories = list(alias.advisories.all())
+
+            datasources = {advisory.datasource_id for advisory in advisories}
+            if len(datasources) < 2:
+                continue
+
+            check_potentially_related_by_aliases(
+                advisories=advisories,
+                alias=alias,
+                todo_to_create=todo_to_create,
+                advisory_relation_to_create=advisory_relation_to_create,
+            )
+
+            if len(todo_to_create) > batch_size:
+                new_todos_count += bulk_create_with_m2m(
+                    todos=todo_to_create,
+                    advisories=advisory_relation_to_create,
+                    logger=self.log,
+                )
+                advisory_relation_to_create.clear()
+                todo_to_create.clear()
+
+        new_todos_count += bulk_create_with_m2m(
+            todos=todo_to_create,
+            advisories=advisory_relation_to_create,
+            logger=self.log,
+        )
+
+        self.log(
+            f"Successfully created {new_todos_count} ToDos for potentially related advisories by aliases"
+        )
+
+    def detect_similar_summaries(self):
+        """
+        Create ToDos for advisories from different datasources that share the same alias
+        and have summaries with similarity at or above SUMMARY_SIMILARITY_THRESHOLD.
+        """
+        aliases = AdvisoryAlias.objects.prefetch_related("advisories")
+        aliases_count = aliases.count()
+        advisory_relation_to_create = {}
+        todo_to_create = []
+        new_todos_count = 0
+        batch_size = 5000
+
+        self.log(f"Checking summary similarity across {aliases_count} aliases")
+
+        progress = LoopProgress(
+            total_iterations=aliases_count,
+            logger=self.log,
+            progress_step=1,
+        )
+        for alias in progress.iter(aliases.iterator(chunk_size=2000)):
+            # Filter the prefetched advisories in Python; exclude()/only() on the
+            # manager would bypass the prefetch cache and query once per alias.
+            advisory_objs = [advisory for advisory in alias.advisories.all() if advisory.summary]
+
+            datasources = {a.datasource_id for a in advisory_objs}
+            if len(datasources) < 2:
+                continue
+
+            check_similar_summaries(
+                advisories=advisory_objs,
+                todo_to_create=todo_to_create,
+                advisory_relation_to_create=advisory_relation_to_create,
+            )
+
+            if len(todo_to_create) > batch_size:
+                new_todos_count += bulk_create_with_m2m(
+                    todos=todo_to_create,
+                    advisories=advisory_relation_to_create,
+                    logger=self.log,
+                )
+                advisory_relation_to_create.clear()
+                todo_to_create.clear()
+
+        new_todos_count += bulk_create_with_m2m(
+            todos=todo_to_create,
+            advisories=advisory_relation_to_create,
+            logger=self.log,
+        )
+
+        self.log(
+            f"Successfully created {new_todos_count} ToDos for advisories with similar summaries"
+        )
+
 
 def check_missing_summary(
     advisory: AdvisoryV2,
@@ -351,3 +466,75 @@ def bulk_create_with_m2m(todos, advisories, logger):
         logger(f"Error creating Advisory ToDo relations: {e}")
 
     return new_todos.count()
+
+
+def check_potentially_related_by_aliases(
+    advisories,
+    alias,
+    todo_to_create,
+    advisory_relation_to_create,
+):
+    """
+    Create a POTENTIALLY_RELATED_BY_ALIASES ToDo for advisories from different
+    datasources that share the same alias.
+    """
+    todo_id = advisories_checksum(advisories)
+    if todo_id in advisory_relation_to_create:
+        # The same advisory set is already queued through another shared alias.
+        return
+
+    todo = AdvisoryToDoV2(
+        related_advisories_id=todo_id,
+        issue_type="POTENTIALLY_RELATED_BY_ALIASES",
+        issue_detail=json.dumps({"shared_alias": str(alias)}),
+    )
+    todo_to_create.append(todo)
+    advisory_relation_to_create[todo_id] = advisories
+
+
+def check_similar_summaries(
+    advisories,
+    todo_to_create,
+    advisory_relation_to_create,
+):
+    """
+    Create SIMILAR_SUMMARIES ToDos for pairs of advisories from different datasources
+    whose summaries have a similarity ratio at or above SUMMARY_SIMILARITY_THRESHOLD.
+    """
+    for advisory_a, advisory_b in combinations(advisories, 2):
+        if advisory_a.datasource_id == advisory_b.datasource_id:
+            continue
+
+        pair = [advisory_a, advisory_b]
+        todo_id = advisories_checksum(pair)
+        if todo_id in advisory_relation_to_create:
+            # This pair is already queued through another shared alias.
+            continue
+
+        # autojunk=False: the default heuristic ignores "popular" characters once a
+        # text reaches 200 characters, which understates similarity of long summaries.
+        matcher = difflib.SequenceMatcher(
+            None, advisory_a.summary, advisory_b.summary, autojunk=False
+        )
+        # quick_ratio() is a cheap upper bound on ratio(), so clearly dissimilar
+        # pairs are rejected before the expensive comparison runs.
+        if matcher.quick_ratio() < SUMMARY_SIMILARITY_THRESHOLD:
+            continue
+
+        ratio = matcher.ratio()
+        if ratio < SUMMARY_SIMILARITY_THRESHOLD:
+            continue
+
+        todo = AdvisoryToDoV2(
+            related_advisories_id=todo_id,
+            issue_type="SIMILAR_SUMMARIES",
+            issue_detail=json.dumps(
+                {
+                    "similarity_score": round(ratio, 4),
+                    "datasource_a": advisory_a.datasource_id,
+                    "datasource_b": advisory_b.datasource_id,
+                }
+            ),
+        )
+        todo_to_create.append(todo)
+        advisory_relation_to_create[todo_id] = pair
diff --git a/vulnerabilities/tests/pipelines/test_compute_advisory_todo_v2.py b/vulnerabilities/tests/pipelines/test_compute_advisory_todo_v2.py
index 3c234db54..6cfbbbbe7 100644
--- a/vulnerabilities/tests/pipelines/test_compute_advisory_todo_v2.py
+++ b/vulnerabilities/tests/pipelines/test_compute_advisory_todo_v2.py
@@ -8,6 +8,7 @@
 #
 
 from datetime import datetime
+from datetime import timezone
 
 from django.test import TestCase
 from packageurl import PackageURL
@@ -206,3 +207,171 @@ def test_advisory_todo_conflicting_fixed_affected(self):
         )
         self.assertEqual(2, todo.advisories.count())
         self.assertEqual(todo, adv2.advisory_todos.first())
+
+    def test_relate_advisories_by_aliases_creates_todo(self):
+        """Two advisories from different datasources sharing an alias get flagged."""
+        alias = AdvisoryAlias.objects.create(alias="CVE-2021-9999")
+        date = datetime.now(timezone.utc)
+
+        adv1 = AdvisoryV2.objects.create(
+            unique_content_id="alias_test_id1",
+            url="https://example.com/1",
+            summary="A vulnerability in foo",
+            date_collected=date,
+            advisory_id="CVE-2021-9999",
+            avid="nvd_importer/CVE-2021-9999",
+            datasource_id="nvd_importer",
+        )
+        adv1.aliases.add(alias)
+
+        adv2 = AdvisoryV2.objects.create(
+            unique_content_id="alias_test_id2",
+            url="https://example.com/2",
+            summary="A vulnerability in foo package",
+            date_collected=date,
+            advisory_id="CVE-2021-9999",
+            avid="github_osv_importer/CVE-2021-9999",
+            datasource_id="github_osv_importer",
+        )
+        adv2.aliases.add(alias)
+
+        pipeline = ComputeToDo()
+        pipeline.execute()
+
+        todos = AdvisoryToDoV2.objects.filter(issue_type="POTENTIALLY_RELATED_BY_ALIASES")
+        self.assertEqual(1, todos.count())
+        self.assertEqual(2, todos.first().advisories.count())
+
+    def test_relate_advisories_by_aliases_same_datasource_not_flagged(self):
+        """Two advisories from the same datasource sharing an alias are not flagged."""
+        alias = AdvisoryAlias.objects.create(alias="CVE-2021-8888")
+        date = datetime.now(timezone.utc)
+
+        adv1 = AdvisoryV2.objects.create(
+            unique_content_id="same_ds_id1",
+            url="https://example.com/1",
+            summary="Vulnerability in bar",
+            date_collected=date,
+            advisory_id="CVE-2021-8888",
+            avid="nvd_importer/CVE-2021-8888-1",
+            datasource_id="nvd_importer",
+        )
+        adv1.aliases.add(alias)
+
+        adv2 = AdvisoryV2.objects.create(
+            unique_content_id="same_ds_id2",
+            url="https://example.com/2",
+            summary="Vulnerability in bar package",
+            date_collected=date,
+            advisory_id="CVE-2021-8888",
+            avid="nvd_importer/CVE-2021-8888-2",
+            datasource_id="nvd_importer",
+        )
+        adv2.aliases.add(alias)
+
+        pipeline = ComputeToDo()
+        pipeline.execute()
+
+        todos = AdvisoryToDoV2.objects.filter(issue_type="POTENTIALLY_RELATED_BY_ALIASES")
+        self.assertEqual(0, todos.count())
+
+    def test_detect_similar_summaries_creates_todo(self):
+        """Two advisories from different datasources with similar summaries get flagged."""
+        alias = AdvisoryAlias.objects.create(alias="CVE-2021-7777")
+        date = datetime.now(timezone.utc)
+
+        adv1 = AdvisoryV2.objects.create(
+            unique_content_id="sim_sum_id1",
+            url="https://example.com/1",
+            summary="Buffer overflow in nginx version 1.2 allows remote code execution",
+            date_collected=date,
+            advisory_id="CVE-2021-7777",
+            avid="nvd_importer/CVE-2021-7777",
+            datasource_id="nvd_importer",
+        )
+        adv1.aliases.add(alias)
+
+        adv2 = AdvisoryV2.objects.create(
+            unique_content_id="sim_sum_id2",
+            url="https://example.com/2",
+            summary="Buffer overflow in nginx version 1.2 allows remote code execution.",
+            date_collected=date,
+            advisory_id="CVE-2021-7777",
+            avid="debian_importer_v2/CVE-2021-7777",
+            datasource_id="debian_importer_v2",
+        )
+        adv2.aliases.add(alias)
+
+        pipeline = ComputeToDo()
+        pipeline.execute()
+
+        todos = AdvisoryToDoV2.objects.filter(issue_type="SIMILAR_SUMMARIES")
+        self.assertEqual(1, todos.count())
+        self.assertEqual(2, todos.first().advisories.count())
+        self.assertIn("similarity_score", todos.first().issue_detail)
+
+    def test_detect_similar_summaries_below_threshold_not_flagged(self):
+        """Two advisories with very different summaries are not flagged."""
+        alias = AdvisoryAlias.objects.create(alias="CVE-2021-6666")
+        date = datetime.now(timezone.utc)
+
+        adv1 = AdvisoryV2.objects.create(
+            unique_content_id="diff_sum_id1",
+            url="https://example.com/1",
+            summary="Buffer overflow in nginx allows remote code execution",
+            date_collected=date,
+            advisory_id="CVE-2021-6666",
+            avid="nvd_importer/CVE-2021-6666",
+            datasource_id="nvd_importer",
+        )
+        adv1.aliases.add(alias)
+
+        adv2 = AdvisoryV2.objects.create(
+            unique_content_id="diff_sum_id2",
+            url="https://example.com/2",
+            summary="SQL injection vulnerability in Django ORM affects all versions before 3.2",
+            date_collected=date,
+            advisory_id="CVE-2021-6666",
+            avid="debian_importer_v2/CVE-2021-6666",
+            datasource_id="debian_importer_v2",
+        )
+        adv2.aliases.add(alias)
+
+        pipeline = ComputeToDo()
+        pipeline.execute()
+
+        todos = AdvisoryToDoV2.objects.filter(issue_type="SIMILAR_SUMMARIES")
+        self.assertEqual(0, todos.count())
+
+    def test_detect_similar_summaries_empty_summary_skipped(self):
+        """Advisories with empty summaries are not compared for similarity."""
+        alias = AdvisoryAlias.objects.create(alias="CVE-2021-5555")
+        date = datetime.now(timezone.utc)
+
+        adv1 = AdvisoryV2.objects.create(
+            unique_content_id="empty_sum_id1",
+            url="https://example.com/1",
+            summary="",
+            date_collected=date,
+            advisory_id="CVE-2021-5555",
+            avid="nvd_importer/CVE-2021-5555",
+            datasource_id="nvd_importer",
+        )
+        adv1.aliases.add(alias)
+
+        adv2 = AdvisoryV2.objects.create(
+            unique_content_id="empty_sum_id2",
+            url="https://example.com/2",
+            summary="Buffer overflow in nginx",
+            date_collected=date,
+            advisory_id="CVE-2021-5555",
+            avid="debian_importer_v2/CVE-2021-5555",
+            datasource_id="debian_importer_v2",
+        )
+        adv2.aliases.add(alias)
+
+        pipeline = ComputeToDo()
+        pipeline.execute()
+
+        todos = AdvisoryToDoV2.objects.filter(issue_type="SIMILAR_SUMMARIES")
+        self.assertEqual(0, todos.count())