Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion sagemaker-core/src/sagemaker/core/git_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,22 @@ def _clone_command_for_codecommit_https(git_config, dest_dir):
_run_clone_command(updated_url, dest_dir)


def _redact_credentials_from_url(url):
"""Redact credentials embedded in an HTTPS Git URL.

Replaces any username, password, or token embedded before the '@' in an
HTTPS URL with a placeholder so that credentials are never exposed in
logs or exception messages.

Args:
url (str): The Git repository URL that may contain embedded credentials.

Returns:
str: The URL with credentials replaced by '<credentials-redacted>'.
"""
return re.sub(r"(https://)([^@]+)@", r"\1<credentials-redacted>@", url)


def _run_clone_command(repo_url, dest_dir):
"""Run the 'git clone' command with the repo url and the directory to clone the repo into.

Expand All @@ -343,7 +359,24 @@ def _run_clone_command(repo_url, dest_dir):
my_env = os.environ.copy()
if repo_url.startswith("https://"):
my_env["GIT_TERMINAL_PROMPT"] = "0"
subprocess.check_call(["git", "clone", repo_url, dest_dir], env=my_env)
try:
subprocess.check_call(["git", "clone", repo_url, dest_dir], env=my_env)
except subprocess.CalledProcessError as e:
# Re-raise with credentials redacted from the command to prevent
# plaintext tokens/passwords from leaking into logs or tracebacks.
safe_url = _redact_credentials_from_url(repo_url)
raise subprocess.CalledProcessError(
e.returncode,
["git", "clone", safe_url, dest_dir],
output=e.output,
stderr=e.stderr,
) from None
except Exception as e:

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The catch-all `except Exception as e` uses `type(e)(str(e).replace(repo_url, safe_url))`, which assumes the exception type can be reconstructed from a single string argument. This could fail for custom exception types with different constructors. Consider wrapping it in a secondary try/except or logging a warning.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, adding in a new commit.

safe_url = _redact_credentials_from_url(repo_url)
try:
raise type(e)(str(e).replace(repo_url, safe_url)) from None
except TypeError:
raise RuntimeError(str(e).replace(repo_url, safe_url)) from None
elif repo_url.startswith("git@") or repo_url.startswith("ssh://"):
try:
with tempfile.TemporaryDirectory() as tmp_dir:
Expand Down
125 changes: 125 additions & 0 deletions sagemaker-core/tests/unit/test_git_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import subprocess

import pytest

from sagemaker.core import git_utils
Expand Down Expand Up @@ -308,6 +310,129 @@ def test_git_clone_repo_blocks_url_encoded_attack(self):
git_utils.git_clone_repo(malicious_git_config, entry_point)
assert "Suspicious URL encoding detected" in str(error.value)

class TestCredentialRedaction:
"""Test cases for credential redaction in clone error handling."""

def test_redact_token_from_url(self):
    """A bare token placed before the '@' of an HTTPS URL is replaced by the placeholder."""
    token_url = "https://ghp_SuperSecretToken123@github.com/user/repo.git"

    redacted = git_utils._redact_credentials_from_url(token_url)

    # The secret must be gone and the rest of the URL left intact.
    assert "ghp_SuperSecretToken123" not in redacted
    assert redacted == "https://<credentials-redacted>@github.com/user/repo.git"

def test_redact_username_password_from_url(self):
    """A ``user:password`` pair in an HTTPS URL is replaced by the placeholder."""
    cred_url = "https://myuser:mypassword@github.com/user/repo.git"

    redacted = git_utils._redact_credentials_from_url(cred_url)

    # Neither half of the credential pair may survive redaction.
    assert "myuser" not in redacted
    assert "mypassword" not in redacted
    assert redacted == "https://<credentials-redacted>@github.com/user/repo.git"

def test_redact_url_encoded_password(self):
    """Percent-encoded credentials are redacted just like plain ones."""
    encoded_url = (
        "https://user:p%40ss%20word@git-codecommit.us-east-1.amazonaws.com/v1/repos/myrepo"
    )

    redacted = git_utils._redact_credentials_from_url(encoded_url)

    assert "p%40ss%20word" not in redacted
    assert "<credentials-redacted>" in redacted

def test_no_redaction_without_credentials(self):
    """An HTTPS URL that carries no credentials passes through untouched."""
    plain_url = "https://github.com/user/repo.git"
    assert git_utils._redact_credentials_from_url(plain_url) == plain_url

def test_no_redaction_for_ssh_url(self):
    """An scp-style SSH URL is returned unchanged even though it contains '@'."""
    ssh_url = "git@github.com:user/repo.git"
    assert git_utils._redact_credentials_from_url(ssh_url) == ssh_url

@pytest.fixture
def mock_env(self, monkeypatch):
    """Pin PATH to a minimal value for tests that go through subprocess calls."""
    minimal_path = "/usr/bin:/bin"
    monkeypatch.setenv("PATH", minimal_path)

def test_clone_failure_redacts_token(self, mock_env):
"""Test that CalledProcessError from a failed clone does not contain the token."""
from unittest.mock import patch

token_url = "https://ghp_secret123@github.com/user/repo.git"
with patch(
"subprocess.check_call",

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mocking `subprocess.check_call` is fine, but we should consider adding an integration test with an actual invalid Git URL.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We did test this locally with a real git clone against an actual invalid URL (https://myuser:ghp_SuperSecretToken123@github.com/nonexistent/repo-xyz) — no mocks. GitHub returned a real "Authentication failed" error, and we confirmed the re-raised exception contains only the `<credentials-redacted>` placeholder, with no trace of the original credentials.

We didn't add it as a formal integration test in CI because it depends on external network access to GitHub, which makes it flaky and environment-dependent — CI runners may have restricted outbound access, and GitHub rate limits or outages would cause false failures. The unit tests with mocked subprocess cover the same code paths deterministically.

Checked the existing integ test suite — there are no integration tests that perform real git clone operations against GitHub. All git_utils tests are unit tests with mocked subprocess. So not adding a network-dependent integ test is consistent with the current test approach.

side_effect=subprocess.CalledProcessError(
128, ["git", "clone", token_url, "/tmp/dest"]
),
):
with pytest.raises(subprocess.CalledProcessError) as exc_info:
git_utils._run_clone_command(token_url, "/tmp/dest")

# The token must NOT appear anywhere in the re-raised exception
assert "ghp_secret123" not in str(exc_info.value)
assert "ghp_secret123" not in str(exc_info.value.cmd)
assert "<credentials-redacted>" in str(exc_info.value.cmd)

def test_clone_failure_redacts_username_password(self, mock_env):
    """A failed clone must not surface the username or password in the raised error."""
    from unittest.mock import patch

    cred_url = "https://admin:hunter2@github.com/org/repo.git"
    # Simulate git exiting non-zero with the credentialed URL in its command line.
    clone_failure = subprocess.CalledProcessError(
        128, ["git", "clone", cred_url, "/tmp/dest"]
    )

    with patch("subprocess.check_call", side_effect=clone_failure):
        with pytest.raises(subprocess.CalledProcessError) as exc_info:
            git_utils._run_clone_command(cred_url, "/tmp/dest")

    reported_cmd = str(exc_info.value.cmd)
    assert "admin" not in reported_cmd
    assert "hunter2" not in reported_cmd
    assert "<credentials-redacted>" in reported_cmd

def test_clone_failure_redacts_codecommit_credentials(self, mock_env):
    """A failed clone of a CodeCommit HTTPS URL must not surface its credentials."""
    from unittest.mock import patch

    cc_url = "https://user:pass@git-codecommit.us-east-1.amazonaws.com/v1/repos/myrepo"
    # Simulate git exiting non-zero with the credentialed URL in its command line.
    clone_failure = subprocess.CalledProcessError(
        128, ["git", "clone", cc_url, "/tmp/dest"]
    )

    with patch("subprocess.check_call", side_effect=clone_failure):
        with pytest.raises(subprocess.CalledProcessError) as exc_info:
            git_utils._run_clone_command(cc_url, "/tmp/dest")

    reported_cmd = str(exc_info.value.cmd)
    assert "user:pass" not in reported_cmd
    assert "<credentials-redacted>" in reported_cmd

def test_clone_failure_suppresses_exception_chain(self, mock_env):
    """Test that the original exception chain is suppressed (from None).

    The original ``CalledProcessError`` carries the credentialed URL in its
    ``cmd``; if it were displayed as chained context in a traceback, the
    secret would leak despite the redacted re-raise.
    """
    from unittest.mock import patch

    token_url = "https://ghp_secret@github.com/user/repo.git"
    with patch(
        "subprocess.check_call",
        side_effect=subprocess.CalledProcessError(
            128, ["git", "clone", token_url, "/tmp/dest"]
        ),
    ):
        with pytest.raises(subprocess.CalledProcessError) as exc_info:
            git_utils._run_clone_command(token_url, "/tmp/dest")

    # __cause__ should be None due to 'from None'
    assert exc_info.value.__cause__ is None
    # `__cause__` is also None for a plain `raise` inside an `except` block,
    # so it cannot prove on its own that `from None` was used. What `from None`
    # actually sets is `__suppress_context__`, which stops the traceback from
    # printing the original (credential-bearing) exception as context.
    assert exc_info.value.__suppress_context__ is True

def test_clone_success_no_exception(self, mock_env):
    """When the (mocked) git clone succeeds, the call completes without raising."""
    from unittest.mock import patch

    token_url = "https://ghp_token@github.com/user/repo.git"
    destination = "/tmp/dest"
    with patch("subprocess.check_call"):
        # Reaching the end of this block without an exception is the assertion.
        git_utils._run_clone_command(token_url, destination)


def test_sanitize_git_url_comprehensive_attack_scenarios(self):
attack_scenarios = [
"https://USER@YOUR_NGROK_OR_LOCALHOST/malicious.git@github.com%25legit%25repo.git",
Expand Down
Loading