Skip to content

Commit 468d67e

Browse files
committed
refactor(config): move check_deliverability setting to defaults.ini
Signed-off-by: Amine <amine.raouane@enim.ac.ma>
1 parent a8d373b commit 468d67e

File tree

4 files changed

+117
-88
lines changed

4 files changed

+117
-88
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ dependencies = [
3838
"problog >= 2.2.6,<3.0.0",
3939
"cryptography >=44.0.0,<45.0.0",
4040
"semgrep == 1.113.0",
41-
"email_validator >=2.2.0,<3.0.0",
41+
"email-validator >=2.2.0,<3.0.0",
4242
]
4343
keywords = []
4444
# https://pypi.org/classifiers/

src/macaron/config/defaults.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,9 @@ cost = 1.0
612612
# The path to the file that contains the list of popular packages.
613613
popular_packages_path =
614614

615+
# A boolean value that determines whether to check the deliverability (i.e. query DNS for the email domain) of the package author and maintainer email addresses.
616+
check_deliverability = True
617+
615618
# ==== The following sections are for source code analysis using Semgrep ====
616619
# rulesets: a reference to a 'ruleset' in this section refers to a Semgrep .yaml file containing one or more rules.
617620
# rules: a reference to a 'rule' in this section refers to an individual rule ID, specified by the '- id:' field in

src/macaron/malware_analyzer/pypi_heuristics/metadata/fake_email.py

Lines changed: 53 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
"""The heuristic analyzer to check the email address of the package maintainers."""
55

66
import logging
7+
import re
78

89
from email_validator import EmailNotValidError, ValidatedEmail, validate_email
910

11+
from macaron.config.defaults import defaults
12+
from macaron.errors import HeuristicAnalyzerValueError
1013
from macaron.json_tools import JsonType, json_extract
1114
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
1215
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
@@ -18,12 +21,49 @@
1821
class FakeEmailAnalyzer(BaseHeuristicAnalyzer):
1922
"""Analyze the email address of the package maintainers."""
2023

24+
# Extracts e-mail-like tokens from a free-form metadata field such as
# "Jane Doe <jane@example.org>, bob@example.com". The pattern is intentionally
# permissive: it only locates candidates, strict validation is done afterwards.
# The local part accepts "_", "%", "+" and "-" and domain labels accept inner
# hyphens, so that an address like "john-doe@my-domain.com" is extracted whole
# instead of being truncated to a different address or missed entirely.
PATTERN = re.compile(
    r"""(?<![A-Za-z0-9._%+-])                   # do not start in the middle of a longer token
    [A-Za-z0-9_%+-]+                            # first local-part segment
    (?:\.[A-Za-z0-9_%+-]+)*                     # optional ".segment" repeats
    @
    [A-Za-z0-9]+(?:-+[A-Za-z0-9]+)*             # domain label, hyphens allowed between alphanumerics
    (?:\.[A-Za-z0-9]+(?:-+[A-Za-z0-9]+)*)*      # optional further labels (sub-domains)
    \.[A-Za-z]{2,}                              # top-level domain (at least 2 letters)
    \b""",
    re.VERBOSE,
)
35+
2136
def __init__(self) -> None:
    """Initialize the base analyzer with this heuristic's name and identifier, then load the settings."""
    super().__init__(name="fake_email_analyzer", heuristic=Heuristics.FAKE_EMAIL, depends_on=None)
    # Passed to validate_email() as its check_deliverability argument.
    self.check_deliverability: bool = self._load_defaults()
43+
44+
def _load_defaults(self) -> bool:
    """Read the ``check_deliverability`` option from defaults.ini.

    Returns
    -------
    bool
        The value of ``check_deliverability`` in the ``heuristic.pypi`` section,
        or True if the section or the option is not present.
    """
    section_name = "heuristic.pypi"
    if not defaults.has_section(section_name):
        return True
    return defaults[section_name].getboolean("check_deliverability", fallback=True)
51+
52+
def get_emails(self, email_field: str) -> list[str]:
    """Collect every email address that ``PATTERN`` finds in an email field.

    Parameters
    ----------
    email_field: str
        The raw email field to search.

    Returns
    -------
    list[str]
        The matches in order of appearance, stripped of surrounding whitespace;
        matches that are empty after stripping are dropped.
    """
    candidates = (match.strip() for match in self.PATTERN.findall(email_field))
    return [candidate for candidate in candidates if candidate]
2767

2868
def is_valid_email(self, email: str) -> ValidatedEmail | None:
2969
"""Check if the email format is valid and the domain has MX records.
@@ -37,15 +77,10 @@ def is_valid_email(self, email: str) -> ValidatedEmail | None:
3777
-------
3878
ValidatedEmail | None
3979
The validated email object if the email is valid, otherwise None.
40-
41-
Raises
42-
------
43-
HeuristicAnalyzerValueError
44-
if the failure is due to DNS resolution.
4580
"""
4681
emailinfo = None
4782
try:
48-
emailinfo = validate_email(email, check_deliverability=True)
83+
emailinfo = validate_email(email, check_deliverability=self.check_deliverability)
4984
except EmailNotValidError as err:
5085
err_message = f"Invalid email address: {email}. Error: {err}"
5186
logger.warning(err_message)
@@ -63,15 +98,10 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
6398
-------
6499
tuple[HeuristicResult, dict[str, JsonType]]:
65100
The result and related information collected during the analysis.
66-
67-
Raises
68-
------
69-
HeuristicAnalyzerValueError
70-
if the analysis fails.
71101
"""
72102
package_json = pypi_package_json.package_json
73103
if not package_json.get("info", {}):
74-
return HeuristicResult.SKIP, {"message": "No package info available."}
104+
raise HeuristicAnalyzerValueError("No package info available.")
75105

76106
author_email = json_extract(package_json, ["info", "author_email"], str)
77107
maintainer_email = json_extract(package_json, ["info", "maintainer_email"], str)
@@ -82,12 +112,17 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
82112
validated_emails: list[JsonType] = []
83113
details = ["normalized", "local_part", "domain"]
84114

85-
for email in [author_email, maintainer_email]:
86-
if email:
87-
email_info = self.is_valid_email(email)
88-
if not email_info:
89-
return HeuristicResult.FAIL, {"email": email}
115+
for email_field in [author_email, maintainer_email]:
116+
if email_field:
117+
emails = self.get_emails(email_field)
118+
if not emails:
119+
return HeuristicResult.FAIL, {"message": "no emails found in the email field"}
120+
121+
for email in emails:
122+
email_info = self.is_valid_email(email)
123+
if not email_info:
124+
return HeuristicResult.FAIL, {"invalid_email": email}
90125

91-
validated_emails.append({key: getattr(email_info, key) for key in details})
126+
validated_emails.append({key: getattr(email_info, key) for key in details})
92127

93128
return HeuristicResult.PASS, {"validated_emails": validated_emails}

tests/malware_analyzer/pypi/test_fake_email.py

Lines changed: 60 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,11 @@
44
"""Tests for the FakeEmailAnalyzer heuristic."""
55

66

7-
from collections.abc import Generator
8-
from unittest.mock import MagicMock, patch
7+
from unittest.mock import MagicMock
98

109
import pytest
11-
from email_validator import EmailNotValidError
1210

11+
from macaron.errors import HeuristicAnalyzerValueError
1312
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
1413
from macaron.malware_analyzer.pypi_heuristics.metadata.fake_email import FakeEmailAnalyzer
1514
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
@@ -22,20 +21,13 @@ def analyzer_() -> FakeEmailAnalyzer:
2221

2322

2423
@pytest.fixture(name="pypi_package_json_asset_mock")
25-
def pypi_package_json_asset_mock_fixture() -> MagicMock:
24+
def pypi_package_json_asset_mock_() -> MagicMock:
2625
"""Pytest fixture for a mock PyPIPackageJsonAsset."""
2726
mock_asset = MagicMock(spec=PyPIPackageJsonAsset)
2827
mock_asset.package_json = {}
2928
return mock_asset
3029

3130

32-
@pytest.fixture(name="mock_validate_email")
33-
def mock_validate_email_fixture() -> Generator[MagicMock]:
34-
"""Patch validate_email and mock its behavior."""
35-
with patch("macaron.malware_analyzer.pypi_heuristics.metadata.fake_email.validate_email") as mock:
36-
yield mock
37-
38-
3931
def test_analyze_skip_no_emails_present(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
4032
"""Test the analyzer skips if no author_email or maintainer_email is present."""
4133
pypi_package_json_asset_mock.package_json = {"info": {"author_email": None, "maintainer_email": None}}
@@ -44,99 +36,98 @@ def test_analyze_skip_no_emails_present(analyzer: FakeEmailAnalyzer, pypi_packag
4436
assert info["message"] == "No author or maintainer email available."
4537

4638

47-
def test_analyze_skip_no_info_key(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
48-
"""Test the analyzer skips if 'info' key is missing in PyPI data."""
39+
def test_analyze_raises_error_for_missing_info_key(
40+
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
41+
) -> None:
42+
"""Test the analyzer raises an error if the 'info' key is missing in the PyPI data."""
4943
pypi_package_json_asset_mock.package_json = {} # No 'info' key
50-
result, info = analyzer.analyze(pypi_package_json_asset_mock)
51-
assert result == HeuristicResult.SKIP
52-
assert info["message"] == "No package info available."
44+
with pytest.raises(HeuristicAnalyzerValueError) as exc_info:
45+
analyzer.analyze(pypi_package_json_asset_mock)
46+
assert "No package info available." in str(exc_info.value)
5347

5448

55-
def test_analyze_fail_invalid_email(
56-
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock, mock_validate_email: MagicMock
49+
def test_analyze_fail_no_email_found_in_field(
50+
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
5751
) -> None:
58-
"""Test analyzer fails for an invalid email format."""
59-
invalid_email = "invalid-email"
52+
"""Test the analyzer fails if an email field does not contain a parsable email address."""
53+
pypi_package_json_asset_mock.package_json = {"info": {"author_email": "not an email", "maintainer_email": None}}
54+
result, info = analyzer.analyze(pypi_package_json_asset_mock)
55+
assert result == HeuristicResult.FAIL
56+
assert info == {"message": "no emails found in the email field"}
57+
58+
59+
def test_analyze_fail_invalid_email(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
60+
"""Test analyzer fails if the email field contains an invalid email format."""
61+
invalid_email = "user@example"
6062
pypi_package_json_asset_mock.package_json = {"info": {"author_email": invalid_email, "maintainer_email": None}}
61-
mock_validate_email.side_effect = EmailNotValidError("Invalid email.")
6263

6364
result, info = analyzer.analyze(pypi_package_json_asset_mock)
64-
6565
assert result == HeuristicResult.FAIL
66-
assert info == {"email": invalid_email}
67-
mock_validate_email.assert_called_once_with(invalid_email, check_deliverability=True)
66+
assert info == {"message": "no emails found in the email field"}
6867

6968

7069
def test_analyze_pass_only_maintainer_email_valid(
71-
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock, mock_validate_email: MagicMock
70+
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
7271
) -> None:
73-
"""Test analyzer passes when only maintainer_email is present and valid."""
72+
"""Test the analyzer passes if only a valid maintainer_email is present and deliverability is not checked."""
7473
email = "maintainer@example.net"
7574
pypi_package_json_asset_mock.package_json = {"info": {"author_email": None, "maintainer_email": email}}
75+
result, info = analyzer.analyze(pypi_package_json_asset_mock)
7676

77-
mock_email_info = MagicMock()
78-
mock_email_info.normalized = "maintainer@example.net"
79-
mock_email_info.local_part = "maintainer"
80-
mock_email_info.domain = "example.net"
81-
mock_validate_email.return_value = mock_email_info
77+
if analyzer.check_deliverability:
78+
assert result == HeuristicResult.FAIL
79+
assert info == {"invalid_email": email}
80+
return
8281

83-
result, info = analyzer.analyze(pypi_package_json_asset_mock)
8482
assert result == HeuristicResult.PASS
8583
assert info["validated_emails"] == [
8684
{"normalized": "maintainer@example.net", "local_part": "maintainer", "domain": "example.net"}
8785
]
88-
mock_validate_email.assert_called_once_with(email, check_deliverability=True)
8986

9087

91-
def test_analyze_pass_both_emails_valid(
92-
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock, mock_validate_email: MagicMock
93-
) -> None:
94-
"""Test the analyzer passes when both emails are present and valid."""
95-
96-
def side_effect(email: str, check_deliverability: bool) -> MagicMock: # pylint: disable=unused-argument
97-
local_part, domain = email.split("@")
98-
mock_email_info = MagicMock()
99-
mock_email_info.normalized = email
100-
mock_email_info.local_part = local_part
101-
mock_email_info.domain = domain
102-
return mock_email_info
103-
104-
mock_validate_email.side_effect = side_effect
88+
def test_analyze_pass_both_emails_valid(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
89+
"""Test the analyzer passes if both emails are valid and deliverability is not checked."""
90+
author_email = "example@gmail.com"
91+
author_local_part, author_domain = author_email.split("@")
92+
maintainer_email = "maintainer@example.net"
93+
maintainer_local_part, maintainer_domain = maintainer_email.split("@")
10594

10695
pypi_package_json_asset_mock.package_json = {
107-
"info": {"author_email": "author@example.com", "maintainer_email": "maintainer@example.net"}
96+
"info": {"author_email": author_email, "maintainer_email": maintainer_email}
10897
}
10998
result, info = analyzer.analyze(pypi_package_json_asset_mock)
99+
if analyzer.check_deliverability:
100+
assert result == HeuristicResult.FAIL
101+
assert info == {"invalid_email": maintainer_email}
102+
return
103+
110104
assert result == HeuristicResult.PASS
111-
assert mock_validate_email.call_count == 2
112105

113106
validated_emails = info.get("validated_emails")
114107
assert isinstance(validated_emails, list)
115108
assert len(validated_emails) == 2
116-
assert {"normalized": "author@example.com", "local_part": "author", "domain": "example.com"} in validated_emails
109+
assert {"normalized": author_email, "local_part": author_local_part, "domain": author_domain} in validated_emails
117110
assert {
118-
"normalized": "maintainer@example.net",
119-
"local_part": "maintainer",
120-
"domain": "example.net",
111+
"normalized": maintainer_email,
112+
"local_part": maintainer_local_part,
113+
"domain": maintainer_domain,
121114
} in validated_emails
122115

123116

124-
def test_is_valid_email_success(analyzer: FakeEmailAnalyzer, mock_validate_email: MagicMock) -> None:
125-
"""Test is_valid_email returns the validation object on success."""
126-
mock_validated_email = MagicMock()
127-
mock_validated_email.normalized = "test@example.com"
128-
mock_validated_email.local_part = "test"
129-
mock_validated_email.domain = "example.com"
130-
131-
mock_validate_email.return_value = mock_validated_email
132-
result = analyzer.is_valid_email("test@example.com")
133-
assert result == mock_validated_email
134-
mock_validate_email.assert_called_once_with("test@example.com", check_deliverability=True)
135-
136-
137-
def test_is_valid_email_failure(analyzer: FakeEmailAnalyzer, mock_validate_email: MagicMock) -> None:
117+
def test_is_valid_email_failure(analyzer: FakeEmailAnalyzer) -> None:
138118
"""Test is_valid_email returns None on failure."""
139-
mock_validate_email.side_effect = EmailNotValidError("The email address is not valid.")
140119
result = analyzer.is_valid_email("invalid-email")
141120
assert result is None
142-
mock_validate_email.assert_called_once_with("invalid-email", check_deliverability=True)
121+
122+
123+
def test_get_emails(analyzer: FakeEmailAnalyzer) -> None:
    """Test that get_emails returns every address in a field, or an empty list if there is none."""
    cases: list[tuple[str, list[str]]] = [
        # Two addresses, one bare and one in "name <address>" form.
        ("test@example.com, another test <another@example.org>", ["test@example.com", "another@example.org"]),
        # Free text without any address.
        ("this is not an email", []),
        # Empty field.
        ("", []),
    ]
    for email_field, expected in cases:
        assert analyzer.get_emails(email_field) == expected

0 commit comments

Comments
 (0)