Skip to content

Commit 5b3133a

Browse files
authored
Improve exception support in improve_packages_from_purldb task #303 (#304)
Signed-off-by: tdruez <tdruez@nexb.com>
1 parent 5bd72e6 commit 5b3133a

File tree

17 files changed

+663
-376
lines changed

17 files changed

+663
-376
lines changed

CHANGELOG.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,18 @@ Release notes
137137
prevent the creation of duplicated "resolved" dependencies.
138138
https://github.com/aboutcode-org/dejacode/issues/297
139139

140+
- Display the filename/download_url in the Inventory tab.
141+
https://github.com/aboutcode-org/dejacode/issues/303
142+
143+
- Improve exception support in improve_packages_from_purldb task.
144+
In case of an exception, the error is properly logged on the Import instance.
145+
https://github.com/aboutcode-org/dejacode/issues/303
146+
147+
- Refine the ``update_from_purldb`` function to avoid any IntegrityError.
148+
Also, when multiple entries are returned from the PurlDB, only the common values are
149+
merged and kept for the data update.
150+
https://github.com/aboutcode-org/dejacode/issues/303
151+
140152
### Version 5.2.1
141153

142154
- Fix the models documentation navigation.

component_catalog/models.py

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
from dje.models import ReferenceNotesMixin
7373
from dje.tasks import logger as tasks_logger
7474
from dje.utils import is_purl_str
75+
from dje.utils import merge_common_non_empty_values
7576
from dje.utils import set_fields_from_object
7677
from dje.validators import generic_uri_validator
7778
from dje.validators import validate_url_segment
@@ -2454,6 +2455,7 @@ def get_purldb_entries(self, user, max_request_call=0, timeout=10):
24542455
is nothing was found.
24552456
"""
24562457
payloads = []
2458+
purldb_entries = []
24572459

24582460
package_url = self.package_url
24592461
if package_url:
@@ -2468,30 +2470,77 @@ def get_purldb_entries(self, user, max_request_call=0, timeout=10):
24682470
if max_request_call and index >= max_request_call:
24692471
return
24702472

2471-
if packages_data := purldb.find_packages(payload, timeout):
2472-
return packages_data
2473+
if purldb_entries := purldb.find_packages(payload, timeout):
2474+
break
2475+
2476+
# Cleanup the PurlDB entries:
2477+
# - Packages with different PURL are excluded.
2478+
if package_url:
2479+
purldb_entries = [entry for entry in purldb_entries if entry.get("purl") == package_url]
2480+
2481+
return purldb_entries
24732482

24742483
def update_from_purldb(self, user):
24752484
"""
2476-
Find this Package in the PurlDB and update empty fields with PurlDB data
2477-
when available.
2485+
Update this Package instance with data from PurlDB.
2486+
2487+
- Retrieves matching entries from PurlDB using the given user.
2488+
- If exactly one match is found, its data is used directly.
2489+
- If multiple entries are found, only values that are non-empty and
2490+
common across all entries are merged and used to update the Package.
24782491
"""
24792492
purldb_entries = self.get_purldb_entries(user)
24802493
if not purldb_entries:
24812494
return
24822495

2483-
package_data = purldb_entries[0]
2496+
purldb_entries_count = len(purldb_entries)
2497+
if purldb_entries_count == 1:
2498+
package_data = purldb_entries[0]
2499+
else:
2500+
package_data = merge_common_non_empty_values(purldb_entries)
2501+
24842502
# The format from PURLDB is "2019-11-18T00:00:00Z"
24852503
if release_date := package_data.get("release_date"):
24862504
package_data["release_date"] = release_date.split("T")[0]
24872505
package_data["license_expression"] = package_data.get("declared_license_expression")
24882506

2507+
# Avoid raising an IntegrityError when the values in `package_data` for the
2508+
# identifier fields already exist on another Package instance.
2509+
#
2510+
# This situation can occur when a complete package (with both `purl` and
2511+
# `download_url`) already exists in the Dataspace, and `update_from_purldb` is
2512+
# called on a different package that has the same `purl` but no `download_url`.
2513+
#
2514+
# If we try to assign the same `download_url` to the second package, it would
2515+
# violate the unique constraints defined in the Package model (since the
2516+
# combination of fields must be unique).
2517+
unique_filters_lookups = {
2518+
field_name: package_data.get(field_name, "")
2519+
for field_name in self.get_identifier_fields()
2520+
}
2521+
unique_filters_qs = (
2522+
Package.objects.scope(self.dataspace)
2523+
.filter(**unique_filters_lookups)
2524+
.exclude(pk=self.pk)
2525+
)
2526+
if unique_filters_qs.exists():
2527+
# Remove the problematic "identifier_fields" values and the checksum values
2528+
hash_field_names = [field.name for field in HashFieldsMixin._meta.fields]
2529+
identifier_fields = self.get_identifier_fields()
2530+
for field_name in [*hash_field_names, *identifier_fields]:
2531+
package_data.pop(field_name, None)
2532+
2533+
# try:
24892534
updated_fields = self.update_from_data(
24902535
user,
24912536
package_data,
24922537
override=False,
24932538
override_unknown=True,
24942539
)
2540+
# except IntegrityError as e:
2541+
# logger.error(f"[update_from_purldb] Skipping {self} due to IntegrityError: {e}")
2542+
# return []
2543+
24952544
return updated_fields
24962545

24972546
def update_from_scan(self, user):
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<div title="{{ package.download_url }}"{% if not package.filename %} class="text-truncate"{% endif %}>
2+
{% if package.download_url %}
3+
<a href="{{ package.download_url }}">
4+
{% if display_icons %}
5+
<i class="fa-solid fa-download me-1"></i>
6+
{% endif %}
7+
{% if package.filename %}
8+
{{ package.filename }}
9+
{% else %}
10+
{{ package.download_url|truncatechars:40 }}
11+
{% endif %}
12+
</a>
13+
{% elif package.filename %}
14+
{% if display_icons %}
15+
<i class="fa-solid fa-file me-1"></i>
16+
{% endif %}
17+
{{ package.filename }}
18+
{% endif %}
19+
</div>

component_catalog/templates/component_catalog/tables/package_list_table.html

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,8 @@
5757
<td>
5858
{{ object.primary_language }}
5959
</td>
60-
<td title="{{ object.download_url }}"{% if not object.filename %} class="text-truncate"{% endif %}>
61-
{% if object.download_url %}
62-
<a href="{{ object.download_url }}">
63-
{% if object.filename %}{{ object.filename }}{% else %}{{ object.download_url }}{% endif %}
64-
</a>
65-
{% endif %}
60+
<td>
61+
{% include 'component_catalog/includes/package_filename_as_link.html' with package=object %}
6662
</td>
6763
<td>
6864
{% with components=object.component_set.all %}

component_catalog/tests/test_models.py

Lines changed: 85 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2556,6 +2556,27 @@ def test_package_model_inferred_url_property(self):
25562556
expected = "https://github.com/package-url/packageurl-python/tree/v0.10.4"
25572557
self.assertEqual(expected, package1.inferred_url)
25582558

2559+
@mock.patch("dejacode_toolkit.purldb.PurlDB.find_packages")
2560+
def test_package_model_get_purldb_entries(self, mock_find_packages):
2561+
purl = "pkg:pypi/django@3.0"
2562+
package1 = make_package(self.dataspace, package_url=purl)
2563+
purldb_entry1 = {
2564+
"purl": purl,
2565+
"type": "pypi",
2566+
"name": "django",
2567+
"version": "3.0",
2568+
}
2569+
purldb_entry2 = {
2570+
"purl": "pkg:pypi/django",
2571+
"type": "pypi",
2572+
"name": "django",
2573+
}
2574+
2575+
mock_find_packages.return_value = [purldb_entry1, purldb_entry2]
2576+
purldb_entries = package1.get_purldb_entries(user=self.user)
2577+
# The purldb_entry2 is excluded as the PURL differs
2578+
self.assertEqual([purldb_entry1], purldb_entries)
2579+
25592580
@mock.patch("component_catalog.models.Package.get_purldb_entries")
25602581
def test_package_model_update_from_purldb(self, mock_get_purldb_entries):
25612582
purldb_entry = {
@@ -2577,9 +2598,9 @@ def test_package_model_update_from_purldb(self, mock_get_purldb_entries):
25772598
}
25782599

25792600
mock_get_purldb_entries.return_value = [purldb_entry]
2580-
package1 = Package.objects.create(
2601+
package1 = make_package(
2602+
self.dataspace,
25812603
filename="package",
2582-
dataspace=self.dataspace,
25832604
# "unknown" values are overrided
25842605
declared_license_expression="unknown",
25852606
)
@@ -2607,6 +2628,68 @@ def test_package_model_update_from_purldb(self, mock_get_purldb_entries):
26072628
for field_name in updated_fields:
26082629
self.assertEqual(purldb_entry[field_name], getattr(package1, field_name))
26092630

2631+
@mock.patch("component_catalog.models.Package.get_purldb_entries")
2632+
def test_package_model_update_from_purldb_multiple_entries(self, mock_get_purldb_entries):
2633+
purldb_entry1 = {
2634+
"uuid": "326aa7a8-4f28-406d-89f9-c1404916925b",
2635+
"purl": "pkg:pypi/django@3.0",
2636+
"type": "pypi",
2637+
"name": "django",
2638+
"version": "3.0",
2639+
"keywords": ["Keyword1", "Keyword2"],
2640+
"filename": "Django-3.0.tar.gz",
2641+
"download_url": "https://files.pythonhosted.org/packages/38/Django-3.0.tar.gz",
2642+
}
2643+
purldb_entry2 = {
2644+
"uuid": "e133e70b-8dd3-4cf1-9711-72b1f57523a0",
2645+
"purl": "pkg:pypi/django@3.0",
2646+
"type": "pypi",
2647+
"name": "django",
2648+
"version": "3.0",
2649+
"primary_language": "Python",
2650+
"keywords": ["Keyword1", "Keyword2"],
2651+
"download_url": "https://another.url/Django-3.0.tar.gz",
2652+
}
2653+
2654+
mock_get_purldb_entries.return_value = [purldb_entry1, purldb_entry2]
2655+
package1 = make_package(self.dataspace, package_url="pkg:pypi/django@3.0")
2656+
updated_fields = package1.update_from_purldb(self.user)
2657+
expected = ["filename", "keywords", "primary_language"]
2658+
self.assertEqual(expected, sorted(updated_fields))
2659+
self.assertEqual("Django-3.0.tar.gz", package1.filename)
2660+
self.assertEqual(["Keyword1", "Keyword2"], package1.keywords)
2661+
self.assertEqual("Python", package1.primary_language)
2662+
2663+
@mock.patch("component_catalog.models.Package.get_purldb_entries")
2664+
def test_package_model_update_from_purldb_duplicate_exception(self, mock_get_purldb_entries):
2665+
package_url = "pkg:pypi/django@3.0"
2666+
download_url = "https://files.pythonhosted.org/packages/38/Django-3.0.tar.gz"
2667+
purldb_entry = {
2668+
"purl": package_url,
2669+
"type": "pypi",
2670+
"name": "django",
2671+
"version": "3.0",
2672+
"download_url": download_url,
2673+
"description": "This value will be updated",
2674+
"md5": "This value is skipped",
2675+
"sha1": "This value is skipped",
2676+
}
2677+
mock_get_purldb_entries.return_value = [purldb_entry]
2678+
2679+
# 2 packages with the same "pkg:pypi/django@3.0" PURL:
2680+
# - 1 with a `download_url` value
2681+
# - 1 without a `download_url` value
2682+
make_package(self.dataspace, package_url=package_url, download_url=download_url)
2683+
package_no_download_url = make_package(self.dataspace, package_url=package_url)
2684+
2685+
# Updating the package with the `download_url` from the purldb_entry data
2686+
# would violates the unique constraint.
2687+
# This is handle properly by update_from_purldb.
2688+
updated_fields = package_no_download_url.update_from_purldb(self.user)
2689+
self.assertEqual(["description"], updated_fields)
2690+
package_no_download_url.refresh_from_db()
2691+
self.assertEqual(purldb_entry["description"], package_no_download_url.description)
2692+
26102693
def test_package_model_vulnerability_queryset_mixin(self):
26112694
package1 = make_package(self.dataspace, is_vulnerable=True)
26122695
package2 = make_package(self.dataspace)

component_catalog/tests/test_views.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,20 +1194,20 @@ def test_package_list_view_download_column(self):
11941194
response = self.client.get(reverse("component_catalog:package_list"))
11951195

11961196
expected = f"""
1197-
<td title="{self.package1.download_url}">
1197+
<div title="{self.package1.download_url}">
11981198
<a href="{self.package1.download_url}">
11991199
{self.package1.filename}
12001200
</a>
1201-
</td>
1201+
</div>
12021202
"""
12031203
self.assertContains(response, expected, html=True)
12041204

12051205
expected = f"""
1206-
<td title="{self.package2.download_url}" class="text-truncate">
1206+
<div title="{self.package2.download_url}" class="text-truncate">
12071207
<a href="{self.package2.download_url}">
12081208
{self.package2.download_url}
12091209
</a>
1210-
</td>
1210+
</div>
12111211
"""
12121212
self.assertContains(response, expected, html=True)
12131213

component_catalog/views.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,9 +2435,12 @@ def get_tab_fields(self):
24352435
}
24362436
tab_fields.append(("", alert_context, None, "includes/field_alert.html"))
24372437

2438-
if len(purldb_entries) > 1:
2438+
len_purldb_entries = len(purldb_entries)
2439+
if len_purldb_entries > 1:
24392440
alert_context = {
2440-
"message": "There are multiple entries in the PurlDB for this Package.",
2441+
"message": (
2442+
f"There are {len_purldb_entries} entries in the PurlDB for this Package."
2443+
),
24412444
"full_width": True,
24422445
"alert_class": "alert-warning",
24432446
}

dejacode_toolkit/purldb.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def get_package(self, uuid):
5656

5757
def get_package_by_purl(self, package_url):
5858
"""Get a Package details entry providing its `package_url`."""
59-
if results := self.find_packages({"purl": package_url}):
59+
if results := self.find_packages(payload={"purl": package_url}):
6060
return results[0]
6161

6262
def find_packages(self, payload, timeout=None):

0 commit comments

Comments
 (0)