-
-
Notifications
You must be signed in to change notification settings - Fork 644
Improve package scan performance #4606
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
489af1b
8d6fa73
4fc3af2
4438526
3bbf35e
e0460ef
dde6bc9
5cd5f74
6b6a79b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ set -e | |
|
|
||
| ./configure --dev | ||
| venv/bin/scancode-reindex-licenses | ||
| venv/bin/scancode-cache-package-patterns | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about naming this |
||
|
|
||
| python_tag=$( python -c "import platform;print(f\"cp{''.join(platform.python_version_tuple()[:2])}\")" ) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -246,23 +246,32 @@ | |||||
| win_reg.InstalledProgramFromDockerUtilityvmSoftwareHandler, | ||||||
| ] | ||||||
|
|
||||||
|
|
||||||
| # These handlers are special as they use filetype to | ||||||
| # detect these binaries instead of datafile path patterns | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| # as these are optionally installed, we can skip checking | ||||||
| # for filetype if these are not available | ||||||
| BINARY_PACKAGE_DATAFILE_HANDLERS = [] | ||||||
|
|
||||||
| try: | ||||||
| from go_inspector.binary import get_go_binary_handler | ||||||
| APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_go_binary_handler()) | ||||||
| handler = get_go_binary_handler() | ||||||
| BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) | ||||||
| except ImportError: | ||||||
| pass | ||||||
|
|
||||||
| try: | ||||||
| from rust_inspector.packages import get_rust_binary_handler | ||||||
| APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_rust_binary_handler()) | ||||||
| handler = get_rust_binary_handler() | ||||||
| BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) | ||||||
| except ImportError: | ||||||
| pass | ||||||
|
|
||||||
| ALL_DATAFILE_HANDLERS = ( | ||||||
| APPLICATION_PACKAGE_DATAFILE_HANDLERS + [ | ||||||
| p for p in SYSTEM_PACKAGE_DATAFILE_HANDLERS | ||||||
| if p not in APPLICATION_PACKAGE_DATAFILE_HANDLERS | ||||||
| ] | ||||||
| ] + BINARY_PACKAGE_DATAFILE_HANDLERS | ||||||
| ) | ||||||
|
|
||||||
| # registry of all handler classes keyed by datasource_id | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,236 @@ | ||||||||
| # | ||||||||
| # Copyright (c) nexB Inc. and others. All rights reserved. | ||||||||
| # ScanCode is a trademark of nexB Inc. | ||||||||
| # SPDX-License-Identifier: Apache-2.0 | ||||||||
| # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. | ||||||||
| # See https://github.com/nexB/scancode-toolkit for support or download. | ||||||||
| # See https://aboutcode.org for more information about nexB OSS projects. | ||||||||
| # | ||||||||
|
|
||||||||
| import os | ||||||||
| import fnmatch | ||||||||
| import pickle | ||||||||
| import multiregex | ||||||||
|
|
||||||||
| import attr | ||||||||
| import click | ||||||||
|
|
||||||||
| from commoncode.cliutils import PluggableCommandLineOption | ||||||||
| from commoncode.fileutils import create_dir | ||||||||
| from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS | ||||||||
| from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS | ||||||||
|
|
||||||||
| from scancode_config import packagedcode_cache_dir | ||||||||
| from scancode_config import scancode_cache_dir | ||||||||
|
|
||||||||
| """ | ||||||||
| An on-disk persistent cache of package manifest patterns and related package | ||||||||
| manifest handlers mapping. Loading and dumping the cached package manifest | ||||||||
| patterns is safe to use across multiple processes using lock files. | ||||||||
| """ | ||||||||
|
|
||||||||
| # global in-memory cache of the PkgManifestPatternsCache | ||||||||
| _PACKAGE_CACHE = None | ||||||||
|
|
||||||||
| # This is the Pickle protocol we use, which was added in Python 3.4. | ||||||||
| PICKLE_PROTOCOL = 4 | ||||||||
|
|
||||||||
| PACKAGE_INDEX_LOCK_TIMEOUT = 60 * 6 | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 360 seconds is a lot. 60 secs should be enough. |
||||||||
| PACKAGE_INDEX_DIR = 'package_patterns_index' | ||||||||
| PACKAGE_INDEX_FILENAME = 'index_cache' | ||||||||
| PACKAGE_LOCKFILE_NAME = 'scancode_package_index_lockfile' | ||||||||
| PACKAGE_CHECKSUM_FILE = 'scancode_package_index_tree_checksums' | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not used anymore (also should be dropped from licensing)
Suggested change
|
||||||||
|
|
||||||||
|
|
||||||||
| @attr.s | ||||||||
| class PkgManifestPatternsCache: | ||||||||
| """ | ||||||||
| Represent cacheable package manifest regex patterns, prematchers | ||||||||
| and mappings from regex patterns to datasource IDs for all datafile | ||||||||
| handlers. | ||||||||
| """ | ||||||||
|
|
||||||||
| handler_by_regex = attr.ib(default=attr.Factory(dict)) | ||||||||
| system_package_matcher = attr.ib(default=None) | ||||||||
| application_package_matcher = attr.ib(default=None) | ||||||||
| all_package_matcher = attr.ib(default=None) | ||||||||
|
|
||||||||
| @staticmethod | ||||||||
| def all_multiregex_patterns(application_multiregex_patterns, system_multiregex_patterns): | ||||||||
| return application_multiregex_patterns + [ | ||||||||
| multiregex_pattern | ||||||||
| for multiregex_pattern in system_multiregex_patterns | ||||||||
| if multiregex_pattern not in application_multiregex_patterns | ||||||||
| ] | ||||||||
|
|
||||||||
| @classmethod | ||||||||
| def load_or_build( | ||||||||
| cls, | ||||||||
| packagedcode_cache_dir=packagedcode_cache_dir, | ||||||||
| scancode_cache_dir=scancode_cache_dir, | ||||||||
| force=False, | ||||||||
| timeout=PACKAGE_INDEX_LOCK_TIMEOUT, | ||||||||
| system_package_datafile_handlers=SYSTEM_PACKAGE_DATAFILE_HANDLERS, | ||||||||
| application_package_datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, | ||||||||
| ): | ||||||||
| """ | ||||||||
| Load or build and save and return a PkgManifestPatternsCache object. | ||||||||
| We either load a cached PkgManifestPatternsCache or build and cache the patterns. | ||||||||
| - If the cache exists, it is returned unless corrupted. | ||||||||
| - If ``force`` is True, or if the cache does not exist, a new index is built | ||||||||
| and cached. | ||||||||
| """ | ||||||||
| idx_cache_dir = os.path.join(packagedcode_cache_dir, PACKAGE_INDEX_DIR) | ||||||||
| create_dir(idx_cache_dir) | ||||||||
| cache_file = os.path.join(idx_cache_dir, PACKAGE_INDEX_FILENAME) | ||||||||
| has_cache = os.path.exists(cache_file) and os.path.getsize(cache_file) | ||||||||
|
||||||||
|
|
||||||||
| # bypass build if cache exists | ||||||||
| if has_cache and not force: | ||||||||
| try: | ||||||||
| return load_cache_file(cache_file) | ||||||||
| except Exception as e: | ||||||||
| # work around some rare Windows quirks | ||||||||
| import traceback | ||||||||
| print('Inconsistent Package cache: rebuilding index.') | ||||||||
| print(str(e)) | ||||||||
| print(traceback.format_exc()) | ||||||||
|
|
||||||||
| from scancode import lockfile | ||||||||
| lock_file = os.path.join(scancode_cache_dir, PACKAGE_LOCKFILE_NAME) | ||||||||
|
|
||||||||
| # here, we have no cache: lock, check and rebuild | ||||||||
| try: | ||||||||
| # acquire lock and wait until timeout to get a lock or die | ||||||||
| with lockfile.FileLock(lock_file).locked(timeout=timeout): | ||||||||
|
|
||||||||
| system_multiregex_patterns, system_handlers_by_regex = build_mappings_and_multiregex_patterns( | ||||||||
| datafile_handlers=system_package_datafile_handlers, | ||||||||
| ) | ||||||||
| application_multiregex_patterns, application_handlers_by_regex = build_mappings_and_multiregex_patterns( | ||||||||
| datafile_handlers=application_package_datafile_handlers, | ||||||||
| ) | ||||||||
| all_multiregex_matcher = PkgManifestPatternsCache.all_multiregex_patterns( | ||||||||
| application_multiregex_patterns, system_multiregex_patterns, | ||||||||
| ) | ||||||||
| system_package_matcher = multiregex.RegexMatcher(system_multiregex_patterns) | ||||||||
| application_package_matcher = multiregex.RegexMatcher(application_multiregex_patterns) | ||||||||
| all_package_matcher = multiregex.RegexMatcher(all_multiregex_matcher) | ||||||||
| package_cache = cls( | ||||||||
| handler_by_regex=system_handlers_by_regex | application_handlers_by_regex, | ||||||||
| system_package_matcher=system_package_matcher, | ||||||||
| application_package_matcher=application_package_matcher, | ||||||||
| all_package_matcher=all_package_matcher, | ||||||||
| ) | ||||||||
| package_cache.dump(cache_file) | ||||||||
| return package_cache | ||||||||
|
|
||||||||
| except lockfile.LockTimeout: | ||||||||
| # TODO: handle unable to lock in a nicer way | ||||||||
| raise | ||||||||
|
|
||||||||
| def dump(self, cache_file): | ||||||||
| """ | ||||||||
| Dump this package manifest patterns cache on disk at ``cache_file``. | ||||||||
| """ | ||||||||
| with open(cache_file, 'wb') as fn: | ||||||||
|
||||||||
| pickle.dump(self, fn, protocol=PICKLE_PROTOCOL) | ||||||||
|
|
||||||||
|
|
||||||||
| def get_prematchers_from_glob_pattern(pattern): | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a docstring and a few doctests ? |
||||||||
| return [ | ||||||||
| prematcher.lower().lstrip("/") | ||||||||
| for prematcher in pattern.split("*") | ||||||||
| if prematcher | ||||||||
| ] | ||||||||
|
|
||||||||
|
|
||||||||
| def build_mappings_and_multiregex_patterns(datafile_handlers): | ||||||||
| """ | ||||||||
| Return a mapping of regex patterns to datafile handler IDs and | ||||||||
| multiregex patterns consisting of regex patterns and prematchers. | ||||||||
| """ | ||||||||
| handler_by_regex = {} | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be a |
||||||||
| multiregex_patterns = [] | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about instead having a small attrs or dataclass object to use until a last minute conversion? class AcceleratedPattern:
regex :str # regular expression string
prematchers :list[str] # list of prematcher strinsg for this regex
handler_datasource_id" :str #handlersomething more or less like that would help avoid creating parallel list until the end. |
||||||||
|
|
||||||||
| if not datafile_handlers: | ||||||||
| return multiregex_patterns, handler_by_regex | ||||||||
|
|
||||||||
| with_patterns = [] | ||||||||
|
|
||||||||
| for handler in datafile_handlers: | ||||||||
| if handler.path_patterns: | ||||||||
| with_patterns.append(handler) | ||||||||
|
|
||||||||
| prematchers_by_regex = {} | ||||||||
|
|
||||||||
| for handler in with_patterns: | ||||||||
| for pattern in handler.path_patterns: | ||||||||
| regex_pattern = fnmatch.translate(pattern) | ||||||||
| regex_pattern = fr"{regex_pattern}" | ||||||||
|
|
||||||||
| prematchers_by_regex[regex_pattern] = get_prematchers_from_glob_pattern(pattern) | ||||||||
|
|
||||||||
| if regex_pattern in handler_by_regex: | ||||||||
| handler_by_regex[regex_pattern].append(handler.datasource_id) | ||||||||
| else: | ||||||||
| handler_by_regex[regex_pattern]= [handler.datasource_id] | ||||||||
|
|
||||||||
| for regex in handler_by_regex.keys(): | ||||||||
| regex_and_prematcher = (regex, prematchers_by_regex.get(regex, [])) | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also in keeping with the idea of using an object above, this could be a namedtuple from typing import NamedTuple
class RegexPrematchers(NamedTuple):
regex: str
prematchers: list[str] |
||||||||
| multiregex_patterns.append(regex_and_prematcher) | ||||||||
|
|
||||||||
| return multiregex_patterns, handler_by_regex | ||||||||
|
|
||||||||
|
|
||||||||
| def get_cache( | ||||||||
| force=False, | ||||||||
| packagedcode_cache_dir=packagedcode_cache_dir, | ||||||||
| scancode_cache_dir=scancode_cache_dir, | ||||||||
| ): | ||||||||
| """ | ||||||||
| Return a PkgManifestPatternsCache either rebuilt, cached or loaded from disk. | ||||||||
| """ | ||||||||
| global _PACKAGE_CACHE | ||||||||
|
|
||||||||
| if force or not _PACKAGE_CACHE: | ||||||||
| _PACKAGE_CACHE = PkgManifestPatternsCache.load_or_build( | ||||||||
| packagedcode_cache_dir=packagedcode_cache_dir, | ||||||||
| scancode_cache_dir=scancode_cache_dir, | ||||||||
| force=force, | ||||||||
| # used for testing only | ||||||||
| timeout=PACKAGE_INDEX_LOCK_TIMEOUT, | ||||||||
| ) | ||||||||
| return _PACKAGE_CACHE | ||||||||
|
|
||||||||
|
|
||||||||
| def load_cache_file(cache_file): | ||||||||
| """ | ||||||||
| Return a PkgManifestPatternsCache loaded from ``cache_file``. | ||||||||
| """ | ||||||||
| with open(cache_file, 'rb') as lfc: | ||||||||
|
||||||||
| try: | ||||||||
| return pickle.load(lfc) | ||||||||
|
||||||||
| except Exception as e: | ||||||||
| msg = ( | ||||||||
| 'ERROR: Failed to load package cache (the file may be corrupted ?).\n' | ||||||||
| f'Please delete "{cache_file}" and retry.\n' | ||||||||
| 'If the problem persists, copy this error message ' | ||||||||
| 'and submit a bug report at https://github.com/nexB/scancode-toolkit/issues/' | ||||||||
| ) | ||||||||
| raise Exception(msg) from e | ||||||||
|
|
||||||||
|
|
||||||||
| @click.command(name='scancode-cache-package-patterns') | ||||||||
| @click.help_option('-h', '--help') | ||||||||
| def cache_package_patterns(*args, **kwargs): | ||||||||
| """Create scancode package manifest patterns cache and exit""" | ||||||||
| click.echo('Rebuilding the package cache patterns...') | ||||||||
| get_cache(force=True) | ||||||||
| click.echo('Done.') | ||||||||
|
|
||||||||
|
|
||||||||
| if __name__ == '__main__': | ||||||||
| cache_package_patterns() | ||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| /cache/ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about this:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Or