From 4b7f2c01877304e0796fded7c8fa5dce588aa254 Mon Sep 17 00:00:00 2001 From: Gary O'Neall Date: Fri, 24 Oct 2025 18:13:22 -0700 Subject: [PATCH] Improve performance of duplicate comparisons Only tokenize the license compares once. Improves performance approximately 2 orders of magnitude when processing the entire license list. Performance improvements are negligable for single licenses. Add a timeout to the future gets of cross reference URL details. During testing, this fetch stalled once - this change will create a warning but will prevent an application hang. --- .../LicenseRDFAGenerator.java | 99 ++++++++++++++----- ...XmlLicenseProviderWithCrossRefDetails.java | 14 +-- 2 files changed, 82 insertions(+), 31 deletions(-) diff --git a/src/org/spdx/licenselistpublisher/LicenseRDFAGenerator.java b/src/org/spdx/licenselistpublisher/LicenseRDFAGenerator.java index 836d674..e7b1483 100644 --- a/src/org/spdx/licenselistpublisher/LicenseRDFAGenerator.java +++ b/src/org/spdx/licenselistpublisher/LicenseRDFAGenerator.java @@ -43,6 +43,7 @@ import org.apache.commons.io.FileUtils; import org.spdx.core.InvalidSPDXAnalysisException; import org.spdx.crossref.CrossRefHelper; +import org.spdx.library.ListedLicenses; import org.spdx.library.SpdxModelFactory; import org.spdx.library.model.v2.SpdxConstantsCompatV2; import org.spdx.library.model.v2.license.SpdxListedLicenseException; @@ -51,7 +52,6 @@ import org.spdx.licenselistpublisher.licensegenerator.*; import org.spdx.licensexml.XmlLicenseProviderSingleFile; import org.spdx.licensexml.XmlLicenseProviderWithCrossRefDetails; -import org.spdx.utility.compare.LicenseCompareHelper; import org.spdx.utility.compare.SpdxCompareException; import com.google.gson.Gson; @@ -565,7 +565,7 @@ private static Set writeLicenseList(String version, String releaseDate, throws LicenseGeneratorException, InvalidSPDXAnalysisException, IOException, SpdxListedLicenseException, SpdxCompareException, InvalidLicenseTemplateException { Iterator licenseIter = licenseProvider.getLicenseIterator(); try { - Map addedLicIdTextMap = new HashMap<>(); // keep track for duplicate checking + Map addedLicIdTextMap = new HashMap<>(); // keep track for duplicate checking while (licenseIter.hasNext()) { System.out.print("."); ListedLicenseContainer licenseContainer = licenseIter.next(); @@ -576,21 +576,10 @@ private static Set writeLicenseList(String version, String releaseDate, addExternalMetaData(licenseContainer); String licenseId = licenseContainer.getV2ListedLicense().getLicenseId(); if (licenseId != null && !licenseId.isEmpty()) { - // Check for duplicate licenses - if (!licenseContainer.getV2ListedLicense().isDeprecated()) { - Iterator> addedLicenseTextIter = addedLicIdTextMap.entrySet().iterator(); - while (addedLicenseTextIter.hasNext()) { - Entry entry = addedLicenseTextIter.next(); - if (LicenseTextHelper.isLicenseTextEquivalent(entry.getValue(), licenseContainer.getV2ListedLicense().getLicenseText())) { - warnings.add("Duplicates licenses: "+licenseContainer.getV2ListedLicense().getLicenseId()+", "+entry.getKey()); - } - } - addedLicIdTextMap.put(licenseId, licenseContainer.getV2ListedLicense().getLicenseText()); - } checkText(licenseContainer.getV2ListedLicense().getLicenseText(), "License text for "+licenseId, warnings); if (tester != null) { List testResults = tester.testLicense(licenseContainer); - if (testResults != null && testResults.size() > 0) { + if (testResults != null && !testResults.isEmpty()) { for (String testResult:testResults) { warnings.add("Test for license "+licenseId + " failed: "+testResult); } @@ -602,6 +591,18 @@ private static Set writeLicenseList(String version, String releaseDate, } } } + // Check for duplicate licenses + if (!licenseContainer.getV2ListedLicense().isDeprecated()) { + String[] licenseTokens = LicenseTextHelper.tokenizeLicenseText( + licenseContainer.getV2ListedLicense().getLicenseText(), + new HashMap<>()); + for (Entry entry : addedLicIdTextMap.entrySet()) { + if (isLicenseTextEquivalent(entry.getValue(), licenseTokens)) { + warnings.add("Duplicates licenses: " + licenseContainer.getV2ListedLicense().getLicenseId() + ", " + entry.getKey()); + } + } + addedLicIdTextMap.put(licenseId, licenseTokens); + } for (ILicenseFormatWriter writer : writers) { if (writer instanceof LicenseTextFormatWriter) { ((LicenseTextFormatWriter)(writer)).writeLicense(licenseContainer, @@ -616,21 +617,20 @@ private static Set writeLicenseList(String version, String releaseDate, } if (addedLicIdTextMap.size() == 1) { // Since we are only creating a single file, we should check the listed licenses for duplicates - addedLicIdTextMap.entrySet().forEach(entry -> { - String[] matchingLicenseIds; + addedLicIdTextMap.forEach((key, value) -> { try { - matchingLicenseIds = LicenseCompareHelper.matchingStandardLicenseIds(entry.getValue()); - for (String matchingId:matchingLicenseIds) { - if (!entry.getKey().equals(matchingId)) { - warnings.add("Duplicates licenses: "+entry.getKey()+", "+matchingId); + for (String stdLicenseId : ListedLicenses.getListedLicenses().getSpdxListedLicenseIds()) { + String[] stdLicenseTokens = LicenseTextHelper.tokenizeLicenseText( + ListedLicenses.getListedLicenses().getListedLicenseByIdCompatV2(stdLicenseId).getLicenseText(), + new HashMap<>()); + if (isLicenseTextEquivalent(value, stdLicenseTokens)) { + warnings.add("Duplicates licenses: " + key + ", " + stdLicenseId); } } } catch (InvalidSPDXAnalysisException e) { - warnings.add("Error comparing single license to existing listed licenses: "+e.getMessage()); - } catch (SpdxCompareException e) { - warnings.add("Error comparing single license to existing listed licenses: "+e.getMessage()); + warnings.add("Error comparing single license to existing listed licenses: " + e.getMessage()); } - }); + }); } return addedLicIdTextMap.keySet(); } finally { @@ -641,6 +641,57 @@ private static Set writeLicenseList(String version, String releaseDate, } } + /** + * Returns true if two sets of license tokens is considered a match per + * the SPDX License matching guidelines documented at spdx.org (currently license matching guidelines) + * There are 2 unimplemented features - bullets/numbering is not considered and comments with no whitespace between text is not skipped + * @param licenseATokens normalized license tokens to compare + * @param licenseBTokens normalized license tokens to compare + * @return true if the license text is equivalent + */ + public static boolean isLicenseTextEquivalent(String[] licenseATokens, String[] licenseBTokens) { + //TODO: Move this to LicenseTextHelper and refactor + int bTokenCounter = 0; + int aTokenCounter = 0; + String nextAToken = LicenseTextHelper.getTokenAt(licenseATokens, aTokenCounter++); + String nextBToken = LicenseTextHelper.getTokenAt(licenseBTokens, bTokenCounter++); + while (nextAToken != null) { + if (nextBToken == null) { + // end of b stream + while (LicenseTextHelper.canSkip(nextAToken)) { + nextAToken = LicenseTextHelper.getTokenAt(licenseATokens, aTokenCounter++); + } + if (nextAToken != null) { + return false; // there is more stuff in the license text B, so not equal + } + } else if (LicenseTextHelper.tokensEquivalent(nextAToken, nextBToken)) { + // just move onto the next set of tokens + nextAToken = LicenseTextHelper.getTokenAt(licenseATokens, aTokenCounter++); + nextBToken = LicenseTextHelper.getTokenAt(licenseBTokens, bTokenCounter++); + } else { + // see if we can skip through some B tokens to find a match + while (LicenseTextHelper.canSkip(nextBToken)) { + nextBToken = LicenseTextHelper.getTokenAt(licenseBTokens, bTokenCounter++); + } + // just to be sure, skip forward on the A license + while (LicenseTextHelper.canSkip(nextAToken)) { + nextAToken = LicenseTextHelper.getTokenAt(licenseATokens, aTokenCounter++); + } + if (!LicenseTextHelper.tokensEquivalent(nextAToken, nextBToken)) { + return false; + } else { + nextAToken = LicenseTextHelper.getTokenAt(licenseATokens, aTokenCounter++); + nextBToken = LicenseTextHelper.getTokenAt(licenseBTokens, bTokenCounter++); + } + } + } + // need to make sure B is at the end + while (LicenseTextHelper.canSkip(nextBToken)) { + nextBToken = LicenseTextHelper.getTokenAt(licenseBTokens, bTokenCounter++); + } + return (nextBToken == null); + } + /** * Update license fields based on information from external metadata * @param licenseContainer diff --git a/src/org/spdx/licensexml/XmlLicenseProviderWithCrossRefDetails.java b/src/org/spdx/licensexml/XmlLicenseProviderWithCrossRefDetails.java index 4d33063..1b71ed9 100644 --- a/src/org/spdx/licensexml/XmlLicenseProviderWithCrossRefDetails.java +++ b/src/org/spdx/licensexml/XmlLicenseProviderWithCrossRefDetails.java @@ -25,10 +25,7 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Objects; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; +import java.util.concurrent.*; import org.spdx.core.IModelCopyManager; import org.spdx.core.InvalidSPDXAnalysisException; @@ -97,7 +94,7 @@ public synchronized ListedLicenseContainer next() { ListedLicenseContainer retval = readyLicense.getKey(); try { - for (CrossRef crossRef:readyLicense.getValue().get()) { + for (CrossRef crossRef:readyLicense.getValue().get(2, TimeUnit.MINUTES)) { retval.getV2ListedLicense().getCrossRef().add(crossRef); } @@ -107,8 +104,11 @@ public synchronized ListedLicenseContainer next() { } catch (InvalidSPDXAnalysisException e) { logger.error("Error setting cross refs",e); warnings.add("Unable to set cross references due to error: "+e.getMessage()); - } - urlDetailsInProgress.remove(retval); + } catch (TimeoutException e) { + logger.error("Timeout getting URL value. URL values will not be filled in for license ID "+retval.getV2ListedLicense().getLicenseId(),e); + warnings.add("Timeout getting URL value. URL values will not be filled in for license ID "+retval.getV2ListedLicense().getLicenseId()); + } + urlDetailsInProgress.remove(retval); fillCrossRefPool(); return retval; }