# src/fetchcode/composer.py
class Composer:
    """
    Resolve Composer PURLs to download URLs using the package metadata served
    under ``base_url``.
    """

    purl_pattern = "pkg:composer/.*"
    base_url = "https://repo.packagist.org"

    @classmethod
    def get_download_url(cls, purl):
        """
        Return the download URL for a Composer PURL.

        The metadata document at ``{base_url}/p2/{name}.json`` is fetched, where
        ``name`` is ``namespace/name`` when the PURL has a namespace and the bare
        name otherwise. The first listed entry whose ``version`` or
        ``version_normalized`` equals the PURL version (with or without a
        leading "v") and that carries a ``dist`` URL provides the result.

        Return None when the package or the version is not listed, or when no
        matching entry provides a ``dist`` URL.
        Raise ValueError when the PURL has no name or no version.
        """
        # Keep the raw ``purl`` argument untouched; work on the parsed object.
        package_url = PackageURL.from_string(purl)

        if not package_url.name or not package_url.version:
            raise ValueError("Composer PURL must specify a name and version")

        if package_url.namespace:
            name = f"{package_url.namespace}/{package_url.name}"
        else:
            name = package_url.name

        data = fetch_json_response(f"{cls.base_url}/p2/{name}.json")
        if not data:
            return None

        releases = (data.get("packages") or {}).get(name) or []
        wanted_versions = (package_url.version, f"v{package_url.version}")

        for release in releases:
            if (
                release.get("version") not in wanted_versions
                and release.get("version_normalized") not in wanted_versions
            ):
                continue

            # A matching entry may lack "dist" or carry a non-mapping value;
            # skip it instead of raising KeyError/AttributeError.
            dist = release.get("dist")
            if isinstance(dist, dict) and dist.get("url"):
                return dist["url"]

        return None
# src/fetchcode/cpan.py
class CPAN:
    """
    Resolve CPAN PURLs to download URLs, first through the MetaCPAN release
    API and then through the author directory under ``base_url``.
    """

    purl_pattern = "pkg:cpan/.*"
    base_url = "https://cpan.metacpan.org/"

    @classmethod
    def get_download_url(cls, purl: str):
        """
        Resolve a CPAN PURL to a verified, downloadable archive URL.

        Strategy:
        1. Query the MetaCPAN release API for ``name``/``version`` and return its
           ``download_url`` (or ``archive``) value when that URL exists.
        2. Otherwise, when the PURL namespace (the author id) is present, probe
           ``authors/id/<A>/<AB>/<AUTHOR>/<name>-<version>`` with the ``.tar.gz``
           and then ``.zip`` extensions.

        Return None when the PURL has no name or version, or when no candidate
        URL exists.

        This is a classmethod, like the other registry classes, so it can be
        called on the class (as before) and also on an instance.
        """
        p = PackageURL.from_string(purl)
        if not p.name or not p.version:
            return None

        quoted_name = urllib.parse.quote(p.name)
        quoted_version = urllib.parse.quote(p.version)

        # Example: https://fastapi.metacpan.org/v1/release/Some-Module/1.2.3
        api = f"https://fastapi.metacpan.org/v1/release/{quoted_name}/{quoted_version}"
        if _http_exists(api):
            data = fetch_json_response(url=api) or {}
            url = data.get("download_url") or data.get("archive")
            if url and _http_exists(url):
                return url

        if not p.namespace:
            return None

        author = p.namespace.upper()
        # Slicing already copes with one-character author ids, so no length
        # check is needed for the two-letter directory level.
        author_dir = f"{cls.base_url.rstrip('/')}/authors/id/{author[0]}/{author[:2]}/{author}"
        for ext in (".tar.gz", ".zip"):
            # Use the same percent-encoded name/version as the API URL above.
            url = f"{author_dir}/{quoted_name}-{quoted_version}{ext}"
            if _http_exists(url):
                return url

        return None
# src/fetchcode/cran.py
class CRAN:
    """
    Resolver for CRAN PURLs.
    """

    purl_pattern = "pkg:cran/.*"
    base_url = "https://cran.r-project.org"

    @classmethod
    def get_download_url(cls, purl: str):
        """
        Return the first existing source tarball URL for a CRAN PURL, or None.

        The current ``src/contrib`` location is probed before the per-package
        ``Archive`` directory. None is also returned when the PURL lacks a name
        or a version.
        """
        package = PackageURL.from_string(purl)
        if not (package.name and package.version):
            return None

        tarball = f"{package.name}_{package.version}.tar.gz"
        contrib = f"{cls.base_url}/src/contrib"
        candidates = (
            f"{contrib}/{tarball}",
            f"{contrib}/Archive/{package.name}/{tarball}",
        )
        # The generator is lazy: the archive URL is only probed when the
        # current one does not exist.
        return next((url for url in candidates if _http_exists(url)), None)
# src/fetchcode/huggingface.py
class Huggingface:
    """
    Resolve Hugging Face model PURLs to the URL of a downloadable model file.
    """

    purl_pattern = "pkg:huggingface/.*"

    @classmethod
    def get_download_url(cls, purl: str):
        """
        Return the download URL for a Hugging Face PURL.

        The model listing is fetched from
        ``https://huggingface.co/api/models/<model_id>?revision=<revision>`` and
        the first entry of its ``siblings`` list whose ``rfilename`` ends with
        ``.bin`` is turned into a ``resolve`` URL. The revision is the PURL
        version, or "main" when the PURL has none.

        Return None when the PURL has no name or when no ``.bin`` file is listed.
        """
        # Function-scope import: the module's import block does not provide it.
        from urllib.parse import quote

        p = PackageURL.from_string(purl)
        if not p.name:
            return None

        # Percent-encode the revision (including "/") so it stays a single
        # query value and a single path segment in the URLs below.
        revision = quote(p.version or "main", safe="")
        model_id = f"{p.namespace}/{p.name}" if p.namespace else p.name

        api_url = f"https://huggingface.co/api/models/{model_id}?revision={revision}"
        data = fetch_json_response(api_url) or {}

        allowed_exts = (".bin",)
        for sibling in data.get("siblings") or []:
            file_name = sibling.get("rfilename")
            # Entries without a file name cannot be resolved; skipping them
            # also avoids calling ``endswith`` on None.
            if not file_name or not file_name.endswith(allowed_exts):
                continue
            return f"https://huggingface.co/{model_id}/resolve/{revision}/{quote(file_name)}"

        return None


# src/fetchcode/utils.py
def _http_exists(url: str) -> bool:
    """
    Return True when ``url`` answers with HTTP status 200 or 206.

    The check is delegated to ``make_head_request`` with a one-byte ``Range``
    header. A missing response, or any exception raised while sending the
    request or reading the status, is reported as False.

    NOTE(review): this was previously described as a "ranged GET", but the call
    goes through ``make_head_request`` -- confirm which HTTP method is sent.
    """
    try:
        response = make_head_request(url, headers={"Range": "bytes=0-0"})
        if response is None:
            return False
        return response.status_code in (200, 206)
    except Exception:
        # Deliberate best-effort: an unreachable URL counts as "does not exist".
        return False
# tests/test_composer.py
def test_valid_composer_package_with_namespace():
    """A namespaced PURL resolves to the dist URL of the matching version."""
    package_name = "laravel/framework"
    dist_url = "https://github.com/laravel/framework/archive/refs/tags/v10.0.0.zip"
    metadata = {"packages": {package_name: [{"version": "10.0.0", "dist": {"url": dist_url}}]}}

    with patch("fetchcode.composer.fetch_json_response", return_value=metadata) as fetch_mock:
        resolved = Composer.get_download_url("pkg:composer/laravel/framework@10.0.0")

    assert resolved == dist_url
    fetch_mock.assert_called_once_with(f"https://repo.packagist.org/p2/{package_name}.json")


def test_valid_composer_package_without_namespace():
    """A PURL without namespace uses the bare package name in the metadata URL."""
    package_name = "some-package"
    dist_url = "https://example.org/some-package-1.0.0.zip"
    metadata = {"packages": {package_name: [{"version": "1.0.0", "dist": {"url": dist_url}}]}}

    with patch("fetchcode.composer.fetch_json_response", return_value=metadata) as fetch_mock:
        resolved = Composer.get_download_url("pkg:composer/some-package@1.0.0")

    assert resolved == dist_url
    fetch_mock.assert_called_once_with(f"https://repo.packagist.org/p2/{package_name}.json")


def test_version_not_found_returns_none():
    """None is returned when only other versions are listed."""
    metadata = {
        "packages": {
            "laravel/framework": [{"version": "9.0.0", "dist": {"url": "https://old.zip"}}],
        }
    }

    with patch("fetchcode.composer.fetch_json_response", return_value=metadata):
        resolved = Composer.get_download_url("pkg:composer/laravel/framework@10.0.0")

    assert resolved is None


def test_missing_packages_key_returns_none():
    """None is returned when the metadata has no "packages" key."""
    with patch("fetchcode.composer.fetch_json_response", return_value={}):
        resolved = Composer.get_download_url("pkg:composer/laravel/framework@10.0.0")

    assert resolved is None


def test_missing_package_name_in_data_returns_none():
    """None is returned when the metadata lists a different package only."""
    metadata = {"packages": {"some/other": []}}

    with patch("fetchcode.composer.fetch_json_response", return_value=metadata):
        resolved = Composer.get_download_url("pkg:composer/laravel/framework@10.0.0")

    assert resolved is None


def test_missing_version_raises():
    """A PURL without a version is rejected with ValueError."""
    expected_message = "Composer PURL must specify a name and version"

    with pytest.raises(ValueError, match=expected_message):
        Composer.get_download_url("pkg:composer/laravel/framework")
# tests/test_cpan.py
get_download_url = CPAN.get_download_url


@pytest.fixture
def valid_purl():
    """PURL carrying an author namespace, a name and a version."""
    return "pkg:cpan/EXAMPLE/Some-Module@1.2.3"


def test_success_from_metacpan_api(valid_purl):
    """The API-provided download URL is returned after two existence checks."""
    api_download_url = (
        "https://cpan.metacpan.org/authors/id/E/EX/EXAMPLE/Some-Module-1.2.3.tar.gz"
    )

    with patch(
        "fetchcode.cpan.fetch_json_response",
        return_value={"download_url": api_download_url},
    ) as fetch_mock, patch("fetchcode.cpan._http_exists", return_value=True) as exists_mock:
        resolved = get_download_url(valid_purl)

    assert resolved == api_download_url
    fetch_mock.assert_called_once()
    assert exists_mock.call_count == 2


def test_fallback_to_author_path(valid_purl):
    """The author-directory tarball is returned when only ".tar.gz" URLs exist."""
    author_tarball = (
        "https://cpan.metacpan.org/authors/id/E/EX/EXAMPLE/Some-Module-1.2.3.tar.gz"
    )

    def only_tarballs_exist(url):
        return url.endswith(".tar.gz")

    with patch("fetchcode.cpan.fetch_json_response", side_effect=Exception("API error")), patch(
        "fetchcode.cpan._http_exists", side_effect=only_tarballs_exist
    ) as exists_mock:
        resolved = get_download_url(valid_purl)

    assert resolved == author_tarball
    assert exists_mock.call_count >= 1


def test_author_zip_fallback(valid_purl):
    """The ".zip" archive is returned after the ".tar.gz" candidate was probed."""
    author_dir = "https://cpan.metacpan.org/authors/id/E/EX/EXAMPLE"
    tar_url = f"{author_dir}/Some-Module-1.2.3.tar.gz"
    zip_url = f"{author_dir}/Some-Module-1.2.3.zip"

    def only_zip_exists(url):
        return url == zip_url

    with patch("fetchcode.cpan.fetch_json_response", return_value={}), patch(
        "fetchcode.cpan._http_exists", side_effect=only_zip_exists
    ) as exists_mock:
        resolved = get_download_url(valid_purl)

    assert resolved == zip_url
    assert exists_mock.call_count == 3
    probed_urls = {recorded[0][0] for recorded in exists_mock.call_args_list}
    assert tar_url in probed_urls


def test_neither_api_nor_fallback_works(valid_purl):
    """None is returned once the API URL and both archive candidates are missing."""
    with patch("fetchcode.cpan.fetch_json_response", return_value={}), patch(
        "fetchcode.cpan._http_exists", return_value=False
    ) as exists_mock:
        resolved = get_download_url(valid_purl)

    assert resolved is None
    assert exists_mock.call_count == 3


def test_missing_name_or_version():
    """A PURL without a version resolves to None."""
    resolved = get_download_url("pkg:cpan/EXAMPLE/Some-Module")
    assert resolved is None
# tests/test_cran.py
get_download_url = CRAN.get_download_url


@pytest.fixture
def valid_purl():
    """PURL with both a name and a version."""
    return "pkg:cran/dplyr@1.0.0"


def test_current_url_exists(valid_purl):
    """The current contrib URL is returned after a single existence check."""
    expected = "https://cran.r-project.org/src/contrib/dplyr_1.0.0.tar.gz"

    with patch("fetchcode.cran._http_exists", return_value=True) as exists_mock:
        resolved = get_download_url(valid_purl)

    assert resolved == expected
    exists_mock.assert_called_once_with(expected)


def test_fallback_to_archive(valid_purl):
    """The Archive URL is returned when the current contrib URL is missing."""
    current = "https://cran.r-project.org/src/contrib/dplyr_1.0.0.tar.gz"
    archive = "https://cran.r-project.org/src/contrib/Archive/dplyr/dplyr_1.0.0.tar.gz"

    with patch(
        "fetchcode.cran._http_exists", side_effect=lambda url: url == archive
    ) as exists_mock:
        resolved = get_download_url(valid_purl)

    assert resolved == archive
    assert exists_mock.call_count == 2
    for probed in (current, archive):
        exists_mock.assert_any_call(probed)


def test_neither_url_exists(valid_purl):
    """None is returned after both locations were probed without success."""
    with patch("fetchcode.cran._http_exists", return_value=False) as exists_mock:
        resolved = get_download_url(valid_purl)

    assert resolved is None
    assert exists_mock.call_count == 2


def test_missing_version_returns_none():
    """A PURL without a version resolves to None."""
    assert get_download_url("pkg:cran/dplyr") is None


def test_version_with_dash():
    """A dash inside the version is kept verbatim in the tarball name."""
    expected = "https://cran.r-project.org/src/contrib/somepkg_1.2-3.tar.gz"

    with patch("fetchcode.cran._http_exists", return_value=True) as exists_mock:
        resolved = get_download_url("pkg:cran/somepkg@1.2-3")

    assert resolved == expected
    exists_mock.assert_called_once_with(expected)


def test_name_with_dot():
    """A dot inside the package name is kept verbatim in the tarball name."""
    expected = "https://cran.r-project.org/src/contrib/foo.bar_2.0.1.tar.gz"

    with patch("fetchcode.cran._http_exists", return_value=True) as exists_mock:
        resolved = get_download_url("pkg:cran/foo.bar@2.0.1")

    assert resolved == expected
    exists_mock.assert_called_once_with(expected)
# tests/test_download_urls.py
def test_right_class_being_called_for_the_purls():
    """``download_url`` yields a value for each supported PURL type."""
    supported_purls = (
        "pkg:pypi/requests@2.31.0",
        "pkg:cpan/EXAMPLE/Some-Module@1.2.3",
        "pkg:composer/laravel/framework@10.0.0",
        "pkg:cran/dplyr@1.0.0",
    )

    # NOTE(review): with ``Router.process`` patched, this presumably only checks
    # that ``download_url`` hands back the mock's return value -- confirm
    # whether the routed class should be asserted as well.
    with patch("fetchcode.download_urls.Router.process"):
        for candidate in supported_purls:
            outcome = download_url(candidate)
            assert outcome is not None, f"Failed for purl: {candidate}"


def test_with_invalid_purls():
    """PURL types without a registered route raise NoRouteAvailable."""
    for unsupported in ("pkg:invalid/requests", "pkg:xyz/dplyr"):
        with pytest.raises(NoRouteAvailable):
            router.process(unsupported)
# tests/test_huggingface.py
def test_returns_bin_file_url():
    """The first ".bin" sibling is resolved against the default "main" revision."""
    listing = {
        "siblings": [
            {"rfilename": "config.json"},
            {"rfilename": "pytorch_model.bin"},
        ]
    }

    with patch("fetchcode.huggingface.fetch_json_response", return_value=listing):
        resolved = Huggingface.get_download_url("pkg:huggingface/facebook/opt-350m")

    assert resolved == "https://huggingface.co/facebook/opt-350m/resolve/main/pytorch_model.bin"


def test_no_executable_files_returns_none():
    """None is returned when no sibling ends with ".bin"."""
    listing = {
        "siblings": [
            {"rfilename": "config.json"},
            {"rfilename": "tokenizer.json"},
        ]
    }

    with patch("fetchcode.huggingface.fetch_json_response", return_value=listing):
        resolved = Huggingface.get_download_url("pkg:huggingface/facebook/opt-350m")

    assert resolved is None


def test_custom_revision_in_purl():
    """The PURL version is used as the revision segment of the resolve URL."""
    listing = {"siblings": [{"rfilename": "pytorch_model.bin"}]}

    with patch("fetchcode.huggingface.fetch_json_response", return_value=listing):
        resolved = Huggingface.get_download_url("pkg:huggingface/facebook/opt-350m@v1.0")

    assert resolved == "https://huggingface.co/facebook/opt-350m/resolve/v1.0/pytorch_model.bin"