53 changes: 28 additions & 25 deletions src/package_report.py
@@ -32,30 +32,30 @@ def _search_package_upstream_version(package, match_spec_out, target_version):
"""
is_major_version_release = target_version.minor == 0 and target_version.patch == 0
is_minor_version_release = target_version.patch == 0 and not is_major_version_release

package_version = str(match_spec_out.get("version")).removeprefix("==")
try:
package_version = get_semver(package_version)
except ValueError:
print(f"Skipping package {package} with non-semver version: {package_version}")
return package, None

channel = match_spec_out.get("channel").channel_name
subdir_filter = "[subdir=" + match_spec_out.get("subdir") + "]"

search_query = f"{channel}::{package}>={str(package_version)}{subdir_filter}"
command = ["conda", "search", search_query, "--json"]

search_result = conda_search_with_retry(command, package)
if search_result is None:
return package, None

try:
package_metadata = json.loads(search_result.stdout)[package]
except (json.JSONDecodeError, KeyError) as e:
print(f"Error parsing search result for package {package}: {str(e)}")
return package, None

# Response is of the structure
# { 'package_name': [{'url':<someurl>, 'dependencies': <List of dependencies>, 'version':
# <version number>}, ..., {'url':<someurl>, 'dependencies': <List of dependencies>, 'version':
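The response layout sketched in the comment above is what the version-selection step walks over. A minimal sketch of that step, using the third-party `semver` package in place of this repo's `get_semver` helper; the major-line filter is an illustrative assumption, not the PR's exact rule:

```python
import json
import semver

def pick_latest_relevant(search_stdout, package, current, is_minor_release):
    # Walk the conda search response: {package: [{"url": ..., "version": ...}, ...]}
    versions = []
    for entry in json.loads(search_stdout)[package]:
        try:
            versions.append(semver.VersionInfo.parse(entry["version"]))
        except ValueError:
            continue  # skip non-semver builds, as the caller above does
    if is_minor_release:
        # Assumed rule: a minor release only considers the current major line.
        versions = [v for v in versions if v.major == current.major]
    return max(versions, default=None)
```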
@@ -82,11 +82,11 @@ def _search_package_dependency(package, version):
Search for a single package's dependency information using conda search with retry logic
"""
command = ["conda", "search", "-c", "conda-forge", f"{package}=={version}", "--json"]

search_result = conda_search_with_retry(command, package, max_retries=3, base_delay=0.5)
if search_result is None:
return package, "N/A - search failed after retries"

try:
package_metadata = json.loads(search_result.stdout)[package][0]
return package, {"version": package_metadata["version"], "depends": package_metadata["depends"]}
@@ -95,41 +95,42 @@ def _search_package_dependency(package, version):
return package, "N/A - parse error"


-def _get_package_versions_in_upstream(target_packages_match_spec_out, target_version, max_workers: int = 20) -> dict[str, str]:
+def _get_package_versions_in_upstream(
+    target_packages_match_spec_out, target_version, max_workers: int = 20
+) -> dict[str, str]:
"""
Get package versions in upstream using parallel conda search calls for improved performance.
This function preserves the original logic for determining latest relevant versions based on
major/minor/patch release types.
"""
package_to_version_mapping = {}

tasks = [
-        (package, match_spec_out, target_version)
-        for package, match_spec_out in target_packages_match_spec_out.items()
+        (package, match_spec_out, target_version) for package, match_spec_out in target_packages_match_spec_out.items()
]

if not tasks:
print("No packages to search")
return package_to_version_mapping

with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_package = {
executor.submit(_search_package_upstream_version, package, match_spec_out, target_version): package
for package, match_spec_out, target_version in tasks
}

completed = 0
for future in as_completed(future_to_package):
completed += 1

try:
package_name, package_version = future.result()
if package_version:
package_to_version_mapping[package_name] = package_version

except Exception as e:
print(f"Unexpected error processing package upstream version: {e}")

return package_to_version_mapping


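Every parallel path in this PR follows the same fan-out/collect shape: submit one search per package, harvest with `as_completed`, and let a single failed future print rather than abort the batch. The pattern in isolation, with `search_one` standing in for any of the `_search_*` helpers:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def fan_out(tasks, search_one, max_workers=20):
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(search_one, pkg, spec): pkg for pkg, spec in tasks}
        for future in as_completed(futures):
            try:
                name, value = future.result()
                if value:  # the helpers return (name, None) on failure
                    results[name] = value
            except Exception as e:
                # One bad lookup should not sink the whole report.
                print(f"Unexpected error processing package: {e}")
    return results
```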
@@ -330,38 +331,40 @@ def _generate_python_package_size_report_per_image(
return validate_result


-def _generate_python_package_dependency_report(image_config, base_version_dir, target_version_dir, max_workers: int = 20):
+def _generate_python_package_dependency_report(
+    image_config, base_version_dir, target_version_dir, max_workers: int = 20
+):
"""
Generate dependency report for newly introduced packages using parallel conda search calls for improved performance.
"""
# Get a list of newly introduced marquee packages in changeset and their versions.
_, new_packages = derive_changeset(target_version_dir, base_version_dir, image_config)

results = dict()

if not new_packages:
print("No new packages found for dependency report")
return

with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_package = {
executor.submit(_search_package_dependency, package, version): package
for package, version in new_packages.items()
}

completed = 0
for future in as_completed(future_to_package):
completed += 1

try:
package_name, package_info = future.result()
results[package_name] = package_info

except Exception as e:
print(f"Unexpected error processing package dependency: {e}")

valid_results = {k: v for k, v in results.items() if isinstance(v, dict) and "version" in v and "depends" in v}

print(
create_markdown_table(
["Package", "Version in the Target Image", "Dependencies"],
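The report ends by printing `valid_results` through `create_markdown_table`, whose body is outside this diff. A hypothetical stand-in with the same headers-plus-rows shape, only to make the output format concrete:

```python
def create_markdown_table(headers, rows):
    # Hypothetical implementation; the repo's real helper may differ.
    lines = ["| " + " | ".join(headers) + " |", "|" + " --- |" * len(headers)]
    for row in rows:
        lines.append("| " + " | ".join(str(cell) for cell in row) + " |")
    return "\n".join(lines)

print(create_markdown_table(
    ["Package", "Version in the Target Image", "Dependencies"],
    [("numpy", "1.26.4", "libblas, python >=3.10")],
))
```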
31 changes: 13 additions & 18 deletions src/utils.py
@@ -105,25 +105,19 @@ def conda_search_with_retry(command, package, max_retries=5, base_delay=1):
"""
for attempt in range(max_retries):
try:
-            search_result = subprocess.run(
-                command,
-                capture_output=True,
-                text=True,
-                check=True,
-                timeout=60
-            )
+            search_result = subprocess.run(command, capture_output=True, text=True, check=True, timeout=60)
return search_result
except subprocess.TimeoutExpired:
if attempt == max_retries - 1:
print(f"Timeout searching for package {package} after {max_retries} attempts, ignore.")
return None
-            delay = base_delay * (2 ** attempt)
+            delay = base_delay * (2**attempt)
time.sleep(delay)
except subprocess.CalledProcessError as e:
if attempt == max_retries - 1:
print(f"Error searching for package {package} after {max_retries} attempts: {str(e)}")
return None
-            delay = base_delay * (2 ** attempt)
+            delay = base_delay * (2**attempt)
time.sleep(delay)
return None

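The loop sleeps `base_delay * (2**attempt)` between attempts and returns None after the last one instead of raising. With the defaults `_search_package_dependency` passes (`max_retries=3`, `base_delay=0.5`), the schedule works out to:

```python
base_delay, max_retries = 0.5, 3
# A sleep happens after every attempt except the final one.
print([base_delay * (2**attempt) for attempt in range(max_retries - 1)])
# [0.5, 1.0]
```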
@@ -134,13 +128,13 @@ def _search_single_package(package: str, match_spec_out) -> Tuple[str, Optional[
"""
package_version = str(match_spec_out.get("version")).removeprefix("==")
channel = match_spec_out.get("channel").channel_name

command = ["conda", "search", "-c", channel, f"{package}=={package_version}", "--json"]

search_result = conda_search_with_retry(command, package)
if search_result is None:
return package, None

try:
package_metadata = json.loads(search_result.stdout)[package][0]
result = {"version": package_metadata["version"], "size": package_metadata["size"]}
@@ -161,32 +155,33 @@ def pull_conda_package_metadata(image_config, image_artifact_dir, max_workers: i
target_packages_match_spec_out = {k: v for k, v in match_spec_out.items()}

conda_forge_packages = [
-        (package, match_spec_out) for package, match_spec_out in target_packages_match_spec_out.items()
+        (package, match_spec_out)
+        for package, match_spec_out in target_packages_match_spec_out.items()
if str(match_spec_out).startswith("conda-forge")
]

if not conda_forge_packages:
print("No conda-forge packages found")
return results

with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_package = {
executor.submit(_search_single_package, package, match_spec): package
for package, match_spec in conda_forge_packages
}

completed = 0
for future in as_completed(future_to_package):
completed += 1

try:
package_name, package_metadata = future.result()
if package_metadata:
results[package_name] = package_metadata

except Exception as e:
print(f"Unexpected error processing package: {e}")

results = {k: v for k, v in sorted(results.items(), key=lambda item: item[1]["size"], reverse=True)}

return results
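The final comprehension orders the report by package size, largest first. A toy run of the same expression:

```python
results = {
    "pandas": {"size": 12_000_000},
    "six": {"size": 30_000},
    "numpy": {"size": 7_000_000},
}
results = {k: v for k, v in sorted(results.items(), key=lambda item: item[1]["size"], reverse=True)}
print(list(results))  # ['pandas', 'numpy', 'six']
```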
26 changes: 6 additions & 20 deletions test/test_dockerfile_based_harness.py
@@ -76,17 +76,10 @@
("jupyter-server-proxy.test.Dockerfile", ["jupyter-server-proxy"]),
("ipywidgets.test.Dockerfile", ["ipywidgets"]),
("supervisor.test.Dockerfile", ["supervisor"]),
("xgboost-cpu.test.Dockerfile", ["xgboost"]),
("sagemaker_workflows.test.Dockerfile", ["sagemaker_workflows"]),
("sagemaker_workflows_artifacts.test.Dockerfile", ["sagemaker_workflows"]),
("sagemaker_studio.test.Dockerfile", ["sagemaker_studio"]),
("sagemaker_studio_cli.test.Dockerfile", ["sagemaker_studio_cli"]),
("xgboost-cpu.test.Dockerfile", ["py-xgboost-cpu"]),
("sagemaker_studio.test.Dockerfile", ["sagemaker-studio"]),
("sagemaker_studio_cli.test.Dockerfile", ["sagemaker-studio-cli"]),
("sm_spark_cli.test.Dockerfile", []),
("sagemaker_studio_dataengineering_sessions.test.Dockerfile", ["sagemaker-studio-dataengineering-sessions"]),
(
"sagemaker_studio_dataengineering_extensions.test.Dockerfile",
["sagemaker-studio-dataengineering-extensions"],
),
("sagemaker_studio.integ.Dockerfile", ["sagemaker_studio"]),
("strands.test.Dockerfile", ["strands-agents"]),
],
@@ -166,19 +159,12 @@ def test_dockerfiles_for_cpu(
("ipywidgets.test.Dockerfile", ["ipywidgets"]),
("supervisor.test.Dockerfile", ["supervisor"]),
("tf-keras.test.Dockerfile", ["tf-keras"]),
("xgboost-gpu.test.Dockerfile", ["xgboost"]),
("xgboost-gpu.test.Dockerfile", ["py-xgboost-gpu"]),
("jupyter-collaboration.test.Dockerfile", ["jupyter-collaboration"]),
("jupyter-server-proxy.test.Dockerfile", ["jupyter-server-proxy"]),
("sagemaker_workflows.test.Dockerfile", ["sagemaker_workflows"]),
("sagemaker_workflows_artifacts.test.Dockerfile", ["sagemaker_workflows"]),
("sagemaker_studio.test.Dockerfile", ["sagemaker_studio"]),
("sagemaker_studio_cli.test.Dockerfile", ["sagemaker_studio_cli"]),
("sagemaker_studio.test.Dockerfile", ["sagemaker-studio"]),
("sagemaker_studio_cli.test.Dockerfile", ["sagemaker-studio-cli"]),
("sm_spark_cli.test.Dockerfile", []),
("sagemaker_studio_dataengineering_sessions.test.Dockerfile", ["sagemaker-studio-dataengineering-sessions"]),
(
"sagemaker_studio_dataengineering_extensions.test.Dockerfile",
["sagemaker-studio-dataengineering-extensions"],
),
("sagemaker_studio.integ.Dockerfile", ["sagemaker_studio"]),
("strands.test.Dockerfile", ["strands-agents"]),
],
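Each tuple pairs a Dockerfile with the package names the harness expects to find in the built image, which is presumably why entries now use conda-forge package names (`py-xgboost-cpu`, `sagemaker-studio`) rather than source-directory names. A hedged sketch of how such a harness might be declared; `build_image` and `has_conda_package` are assumptions, not the repo's actual fixtures:

```python
import pytest

@pytest.mark.parametrize(
    "dockerfile, required_packages",
    [
        ("xgboost-gpu.test.Dockerfile", ["py-xgboost-gpu"]),
        ("sm_spark_cli.test.Dockerfile", []),
    ],
)
def test_dockerfile(dockerfile, required_packages, build_image):
    image = build_image(dockerfile)  # assumed fixture that builds the test image
    for package in required_packages:
        assert image.has_conda_package(package)  # assumed helper
```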