Skip to content

Commit ec4e190

Browse files
nathanwntromai
andauthored
perf: use partial clone to reduce clone time (#389)
Signed-off-by: Nathan Nguyen <nathan.nguyen@oracle.com> Signed-off-by: Trong Nhan Mai <trong.nhan.mai@oracle.com> Co-authored-by: Trong Nhan Mai <trong.nhan.mai@oracle.com>
1 parent 37a96bb commit ec4e190

File tree

4 files changed

+143
-12
lines changed

4 files changed

+143
-12
lines changed

docs/source/pages/developers_guide/apidoc/macaron.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,14 @@ Subpackages
2525
Submodules
2626
----------
2727

28+
macaron.environment\_variables module
29+
-------------------------------------
30+
31+
.. automodule:: macaron.environment_variables
32+
:members:
33+
:undoc-members:
34+
:show-inheritance:
35+
2836
macaron.errors module
2937
---------------------
3038

src/macaron/environment_variables.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""Helper functions related to environment variables."""
5+
6+
import os
7+
from collections.abc import Mapping
8+
9+
10+
def get_patched_env(
11+
patch: Mapping[str, str | None],
12+
_env: dict[str, str] | None = None,
13+
) -> dict[str, str]:
14+
"""Return a dictionary whose elements copied from ``os.environ`` and are updated according to ``patch``.
15+
16+
This function does not modify ``os.environ``.
17+
18+
Parameters
19+
----------
20+
patch : Mapping[str, str | None]
21+
A mapping (immutable) in which:
22+
- each key is an environment variable.
23+
- each value is the value to set to the corresponding environment variable.
24+
If value is ``None``, the environment variable is "unset".
25+
_env : dict[str, str] | None
26+
The environment being updated.
27+
This is ``None`` by default, in which case ``os.environ`` is being updated.
28+
29+
Returns
30+
-------
31+
dict[str, str]
32+
The the dictionary contains the patched env variables.
33+
"""
34+
env = os.environ if _env is None else _env
35+
36+
# Make a copy of the environment.
37+
copied_env = dict(env)
38+
39+
for var, value in patch.items():
40+
if value is None:
41+
copied_env.pop(var, None)
42+
else:
43+
copied_env[var] = value
44+
45+
return copied_env

src/macaron/slsa_analyzer/git_url.py

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,18 @@
88
import os
99
import re
1010
import string
11+
import subprocess # nosec B404
1112
import urllib.parse
1213
from configparser import ConfigParser
14+
from pathlib import Path
1315

1416
from git import GitCommandError
1517
from git.objects import Commit
1618
from git.repo import Repo
1719
from pydriller.git import Git
1820

1921
from macaron.config.defaults import defaults
22+
from macaron.environment_variables import get_patched_env
2023
from macaron.errors import CloneError
2124

2225
logger: logging.Logger = logging.getLogger(__name__)
@@ -235,6 +238,12 @@ def clone_remote_repo(clone_dir: str, url: str) -> Repo | None:
235238
This could happen when multiple runs of Macaron use the same `<output_dir>`, leading
236239
to Macaron potentially trying to clone a repository multiple times.
237240
241+
We use treeless partial clone to reduce clone time, by retrieving trees and blobs lazily.
242+
For more details, see the following:
243+
- https://git-scm.com/docs/partial-clone
244+
- https://git-scm.com/docs/git-rev-list
245+
- https://github.blog/2020-12-21-get-up-to-speed-with-partial-clone-and-shallow-clone
246+
238247
Parameters
239248
----------
240249
clone_dir : str
@@ -268,20 +277,38 @@ def clone_remote_repo(clone_dir: str, url: str) -> Repo | None:
268277
)
269278
return None
270279

280+
# Ensure that the parent directory where the repo is cloned into exists.
281+
parent_dir = Path(clone_dir).parent
282+
parent_dir.mkdir(parents=True, exist_ok=True)
283+
271284
try:
272-
# The Repo.clone_from method handles creating intermediate dirs.
273-
return Repo.clone_from(
274-
url=url,
275-
to_path=clone_dir,
276-
env={
277-
# Setting the GIT_TERMINAL_PROMPT environment variable to ``0`` stops
278-
# ``git clone`` from prompting for login credentials.
279-
"GIT_TERMINAL_PROMPT": "0",
280-
},
285+
git_env_patch = {
286+
# Setting the GIT_TERMINAL_PROMPT environment variable to ``0`` stops
287+
# ``git clone`` from prompting for login credentials.
288+
"GIT_TERMINAL_PROMPT": "0",
289+
}
290+
result = subprocess.run( # nosec B603
291+
args=["git", "clone", "--filter=tree:0", url],
292+
capture_output=True,
293+
cwd=parent_dir,
294+
# If `check=True` and return status code is not zero, subprocess.CalledProcessError is
295+
# raised, which we don't want. We want to check the return status code of the subprocess
296+
# later on.
297+
check=False,
298+
env=get_patched_env(git_env_patch),
281299
)
282-
except GitCommandError as error:
283-
# stderr here does not contain secrets, so it is safe for logging.
284-
raise CloneError(error.stderr) from None
300+
except (subprocess.CalledProcessError, OSError):
301+
# Here, we raise from ``None`` to be extra-safe that no token is leaked.
302+
# We should never store or print out the captured output from the subprocess
303+
# because they might contain the secret-embedded URL.
304+
raise CloneError("Failed to clone repository.") from None
305+
306+
if result.returncode != 0:
307+
raise CloneError(
308+
"Failed to clone repository: the `git clone --filter=tree:0` command exited with non-zero return code."
309+
)
310+
311+
return Repo(path=clone_dir)
285312

286313

287314
def get_repo_name_from_url(url: str) -> str:

tests/test_environment_variables.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""Tests for helper functions related to environment variables."""
5+
6+
import pytest
7+
8+
from macaron.environment_variables import get_patched_env
9+
10+
11+
@pytest.mark.parametrize(
12+
("before", "patch", "expect"),
13+
[
14+
pytest.param(
15+
{"FOO": "some-value"},
16+
{},
17+
{"FOO": "some-value"},
18+
id="patch is empty",
19+
),
20+
pytest.param(
21+
{"FOO": "some-value"},
22+
{"GIT_TERMINAL_PROMPT": "0"},
23+
{
24+
"FOO": "some-value",
25+
"GIT_TERMINAL_PROMPT": "0",
26+
},
27+
id="patch adding a variable",
28+
),
29+
pytest.param(
30+
{"GIT_TERMINAL_PROMPT": "1"},
31+
{"GIT_TERMINAL_PROMPT": "0"},
32+
{"GIT_TERMINAL_PROMPT": "0"},
33+
id="patch overriding a variable",
34+
),
35+
pytest.param(
36+
{"GIT_TERMINAL_PROMPT": "0"},
37+
{"GIT_TERMINAL_PROMPT": None},
38+
{},
39+
id="patch removing a variable",
40+
),
41+
],
42+
)
43+
def test_patched_env(
44+
before: dict[str, str],
45+
patch: dict[str, str | None],
46+
expect: dict[str, str],
47+
) -> None:
48+
"""Tests for the ``get_patched_env`` helper function."""
49+
env = dict(before)
50+
51+
assert get_patched_env(patch, env) == expect

0 commit comments

Comments
 (0)