diff --git a/pom.xml b/pom.xml index 1823a16..e9a96bf 100644 --- a/pom.xml +++ b/pom.xml @@ -12,13 +12,16 @@ org.springframework.boot spring-boot-starter-parent - 3.3.3 + 3.4.5 17 17.0.10 + 10.1.54 + 6.2.11 + 6.5.9 UTF-8 @@ -91,7 +94,6 @@ org.apache.httpcomponents.client5 httpclient5 - 5.3.1 @@ -114,7 +116,6 @@ org.apache.tomcat.embed tomcat-embed-core - 10.1.30 @@ -132,7 +133,6 @@ org.springframework spring-core - 6.1.3 @@ -148,13 +148,11 @@ org.springframework.security spring-security-web - 6.3.0 org.apache.tomcat.embed tomcat-embed-websocket - 11.0.0-M26 @@ -166,19 +164,16 @@ org.springframework spring-web - 6.1.12 org.springframework.boot spring-boot - 3.3.3 org.springframework.security spring-security-core - 6.3.0 diff --git a/src/main/java/org/taniwha/controller/AnalyticsController.java b/src/main/java/org/taniwha/controller/AnalyticsController.java index 3d38cb2..9151449 100644 --- a/src/main/java/org/taniwha/controller/AnalyticsController.java +++ b/src/main/java/org/taniwha/controller/AnalyticsController.java @@ -7,6 +7,7 @@ import org.springframework.web.bind.annotation.*; import org.taniwha.dto.*; import org.taniwha.service.jobs.AnalyticsProcessingJobs; +import org.taniwha.service.AnalyticsAuditService; import org.taniwha.service.AnalyticsService; import java.util.Collections; @@ -21,16 +22,21 @@ public class AnalyticsController { private final AnalyticsService analyticsService; private final AnalyticsProcessingJobs jobs; + private final AnalyticsAuditService auditService; - public AnalyticsController(AnalyticsService analyticsService, AnalyticsProcessingJobs jobs) { + public AnalyticsController(AnalyticsService analyticsService, + AnalyticsProcessingJobs jobs, + AnalyticsAuditService auditService) { this.analyticsService = analyticsService; this.jobs = jobs; + this.auditService = auditService; } @PostMapping("/processList") public ResponseEntity processList(@RequestBody FileNamesDTO dto) { try { List fileNames = dto.getFileNames(); + 
auditService.logRequest("PROCESS", fileNames); boolean huge = analyticsService.isAnyHugeForDiscovery(fileNames); if (!huge) { @@ -100,6 +106,7 @@ public ResponseEntity recalculateFeatureList( @RequestParam("featureType") String featureType ) { logger.debug("File reprocessing request: {} as type {} for file: {}", featureName, featureType, fileName); + auditService.logRequest("REPROCESS", fileName); if (!featureType.equalsIgnoreCase("continuous") && !featureType.equalsIgnoreCase("categorical")) { logger.warn("Invalid feature type provided: {}", featureType); return ResponseEntity.badRequest().body(new AnalyticsResponseDTO("Invalid feature type")); @@ -123,6 +130,11 @@ public ResponseEntity recalculateFeatureList( @PostMapping("/filterByNameList") public ResponseEntity> filterByNameList(@RequestBody MultiFileFilterRequest payload) { logger.debug("Received multiple-file filter request with {} entries", payload.getMultipleFileFilters().size()); + List fileNames = payload.getMultipleFileFilters().stream() + .map(ff -> ff.getFileName()).toList(); + boolean anyFilters = payload.getMultipleFileFilters().stream() + .anyMatch(ff -> ff.getFilters() != null && !ff.getFilters().isEmpty()); + auditService.logRequest("FILTER", fileNames, anyFilters); try { List filteredList = analyticsService.filterMultipleFilesByName(payload.getMultipleFileFilters()); return ResponseEntity.ok(filteredList); diff --git a/src/main/java/org/taniwha/service/AnalyticsAuditService.java b/src/main/java/org/taniwha/service/AnalyticsAuditService.java new file mode 100644 index 0000000..93f0571 --- /dev/null +++ b/src/main/java/org/taniwha/service/AnalyticsAuditService.java @@ -0,0 +1,127 @@ +package org.taniwha.service; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.security.core.Authentication; +import org.springframework.security.core.context.SecurityContextHolder; +import org.springframework.stereotype.Service; + +import java.time.Instant; +import 
java.util.Collection; +import java.util.Map; + +/** + * Records structured audit entries for every analytics and filtering operation. + * + *

Entries are written to the dedicated {@code AUDIT} SLF4J logger, which is + * routed to its own rolling file (see {@code logback.xml}). Each entry is a + * single line in key=value format so it can be ingested by log-aggregation + * tools without additional parsing configuration. + * + *

Fields written per entry:
+ * <ul>
+ *   <li>{@code ts} – ISO-8601 UTC timestamp</li>
+ *   <li>{@code op} – operation name (e.g. {@code PROCESS}, {@code FILTER})</li>
+ *   <li>{@code file} – dataset file name (or comma-separated list)</li>
+ *   <li>{@code filters} – {@code true}/{@code false} whether filters were applied</li>
+ *   <li>{@code records} – number of records in the result (request entries omit this)</li>
+ *   <li>{@code suppressed} – features suppressed by disclosure controls</li>
+ *   <li>{@code principal} – authenticated user name, or {@code anonymous}</li>
+ * </ul>
+ */ +@Service +public class AnalyticsAuditService { + + private static final Logger auditLogger = LoggerFactory.getLogger("AUDIT"); + + // ------------------------------------------------------------------------- + // Request-level audit entries (logged in the controller) + // ------------------------------------------------------------------------- + + /** + * Logs the start of a multi-file analytics request (no filter conditions). + * + * @param operation human-readable operation label (e.g. {@code "PROCESS"}) + * @param fileNames the file names requested + */ + public void logRequest(String operation, Collection fileNames) { + auditLogger.info("ts={} op={} files=[{}] principal={}", + Instant.now(), operation, String.join(",", fileNames), resolvePrincipal()); + } + + /** + * Logs a filter request that carries per-file filter conditions. + * + * @param operation human-readable operation label + * @param fileNames the file names requested + * @param hasFilters whether any filter conditions were supplied + */ + public void logRequest(String operation, Collection fileNames, boolean hasFilters) { + auditLogger.info("ts={} op={} files=[{}] filters={} principal={}", + Instant.now(), operation, String.join(",", fileNames), hasFilters, resolvePrincipal()); + } + + /** + * Logs a single-file request without filter conditions. + * + * @param operation human-readable operation label + * @param fileName the file name requested + */ + public void logRequest(String operation, String fileName) { + auditLogger.info("ts={} op={} files=[{}] principal={}", + Instant.now(), operation, fileName, resolvePrincipal()); + } + + /** + * Logs a single-file request with named filter conditions. 
+ * + * @param operation human-readable operation label + * @param fileName the file name requested + * @param filters map of filter conditions (keys are logged; values are not) + */ + public void logRequest(String operation, String fileName, Map filters) { + boolean hasFilters = filters != null && !filters.isEmpty(); + auditLogger.info("ts={} op={} files=[{}] filters={} filterKeys=[{}] principal={}", + Instant.now(), operation, fileName, hasFilters, + hasFilters ? String.join(",", filters.keySet()) : "", + resolvePrincipal()); + } + + // ------------------------------------------------------------------------- + // Response-level audit entries (logged in the service after processing) + // ------------------------------------------------------------------------- + + /** + * Logs the outcome of processing a single file. + * + * @param operation human-readable operation label + * @param fileName the processed file name + * @param recordCount number of records in the result + * @param suppressedFeatures features suppressed by disclosure controls + */ + public void logResponse(String operation, String fileName, long recordCount, int suppressedFeatures) { + auditLogger.info("ts={} op={} file={} records={} suppressed={} principal={}", + Instant.now(), operation, fileName, recordCount, suppressedFeatures, resolvePrincipal()); + } + + // ------------------------------------------------------------------------- + // Helper + // ------------------------------------------------------------------------- + + /** + * Returns the name of the currently authenticated principal, or + * {@code "anonymous"} if no authentication is available (e.g. in an + * async thread where the {@link SecurityContextHolder} is not propagated). 
+ */ + private String resolvePrincipal() { + try { + Authentication auth = SecurityContextHolder.getContext().getAuthentication(); + if (auth != null && auth.isAuthenticated() && !"anonymousUser".equals(auth.getName())) { + return auth.getName(); + } + } catch (Exception ignored) { + // Defensive: SecurityContextHolder may throw in certain async contexts + } + return "anonymous"; + } +} diff --git a/src/main/java/org/taniwha/service/AnalyticsService.java b/src/main/java/org/taniwha/service/AnalyticsService.java index e90251e..0798447 100644 --- a/src/main/java/org/taniwha/service/AnalyticsService.java +++ b/src/main/java/org/taniwha/service/AnalyticsService.java @@ -60,16 +60,22 @@ public class AnalyticsService { private final DataProcessingService dataProcessingService; private final FileService fileService; private final AnalyticsProcessingJobs jobs; + private final DisclosureControlService disclosureControl; + private final AnalyticsAuditService auditService; private final ExecutorService discoveryJobExecutor; private static final String SUCCESS_MSG = "Data processed successfully"; public AnalyticsService(DataProcessingService dataProcessingService, FileService fileService, - AnalyticsProcessingJobs jobs) { + AnalyticsProcessingJobs jobs, + DisclosureControlService disclosureControl, + AnalyticsAuditService auditService) { this.dataProcessingService = dataProcessingService; this.fileService = fileService; this.jobs = jobs; + this.disclosureControl = disclosureControl; + this.auditService = auditService; // Use a thread factory with meaningful thread names for debugging AtomicLong threadCounter = new AtomicLong(0); ThreadFactory threadFactory = r -> { @@ -349,6 +355,9 @@ private AnalyticsResponseDTO processSingleFileOnDiskWithProgress(String jobId, response.setSpearmanCorrelations(calculator.calculateSpearmanCorrelations(continuousData)); response.setChiSquareTest(calculator.calculateChiSquaredTest(categoricalData, comboCounts)); + int suppressed = 
disclosureControl.apply(response, totalRows); + auditService.logResponse("PROCESS", filename, totalRows, suppressed); + response.setMessage("File processed successfully: " + filename); return response; @@ -415,6 +424,9 @@ public CompletableFuture processSingleFileOnDisk(String fi response.setSpearmanCorrelations(calculator.calculateSpearmanCorrelations(continuousData)); response.setChiSquareTest(calculator.calculateChiSquaredTest(categoricalData, comboCounts)); + int suppressed = disclosureControl.apply(response, totalRows); + auditService.logResponse("PROCESS", filename, totalRows, suppressed); + response.setMessage("File processed successfully: " + filename); } catch (Exception e) { logger.error("Error processing file {}", filename, e); @@ -460,6 +472,8 @@ public CompletableFuture recalculateFeatureAsTypeFromDisk( return CompletableFuture.completedFuture(response); } processData(records, Optional.of(featureName), Optional.of(featureType), response, categoryCombinationCounts); + int suppressed = disclosureControl.apply(response, records.size()); + auditService.logResponse("REPROCESS", fileName, records.size(), suppressed); response.setMessage(SUCCESS_MSG); } catch (Exception e) { String errMsg = e.getMessage() != null ? 
e.getMessage() : "Unknown error"; @@ -763,6 +777,8 @@ public CompletableFuture filterDataByName(String fileName, } processData(records, Optional.empty(), Optional.empty(), response, categoryCombinationCounts); + int suppressed = disclosureControl.apply(response, records.size()); + auditService.logResponse("FILTER", fileName, records.size(), suppressed); response.setMessage(SUCCESS_MSG); } catch (Exception e) { logger.error("Error filtering file by name {}", fileName, e); @@ -830,7 +846,6 @@ private void processData(List> records, response.setCovariances(calculator.calculateCovariances(continuousData)); response.setPearsonCorrelations(calculator.calculatePearsonCorrelations(continuousData)); response.setSpearmanCorrelations(calculator.calculateSpearmanCorrelations(continuousData)); - response.setSpearmanCorrelations(calculator.calculateSpearmanCorrelations(continuousData)); response.setChiSquareTest(calculator.calculateChiSquaredTest(categoricalData, categoryCombinationCounts)); response.setMessage(SUCCESS_MSG); diff --git a/src/main/java/org/taniwha/service/DisclosureControlService.java b/src/main/java/org/taniwha/service/DisclosureControlService.java new file mode 100644 index 0000000..c49ac3a --- /dev/null +++ b/src/main/java/org/taniwha/service/DisclosureControlService.java @@ -0,0 +1,416 @@ +package org.taniwha.service; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Service; +import org.taniwha.dto.AnalyticsResponseDTO; +import org.taniwha.statistics.*; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * Applies privacy disclosure-control rules to every {@link AnalyticsResponseDTO} + * before it is sent to callers. + * + *

Controls applied:
+ * <ol>
+ *   <li>Global minimum-subset rule – if the record count is below
+ *   {@code disclosure.min.subset.size} (default 3), the entire response is
+ *   suppressed: all features are moved to {@code omittedFeatures} and all
+ *   correlation/covariance matrices are cleared. The threshold is intentionally
+ *   set low so that most real datasets pass through unaffected.</li>
+ *   <li>Per-feature minimum-subset rule – individual features whose own
+ *   non-missing count is below the threshold are moved to
+ *   {@code omittedFeatures}.</li>
+ *   <li>Categorical cell suppression (k-anonymity) – category cells with a
+ *   frequency count strictly below {@code disclosure.min.cell.count} (default 2)
+ *   are removed from the frequency map to avoid singling out individuals.
+ *   The threshold is generous: a cell count of 1 (a unique individual) is the
+ *   only value suppressed by default.</li>
+ *   <li>Date histogram suppression – date-bucket entries with a count below
+ *   the cell threshold are removed from the date histogram.</li>
+ *   <li>Outlier-value suppression (k-anonymity for outliers) – raw outlier
+ *   value lists are cleared only when the outlier count itself is below
+ *   {@code minCellCount}. A very small outlier group (e.g. a single
+ *   extreme value) can uniquely identify a patient; a larger group loses that
+ *   identifying power and is kept intact so the analytics UI remains functional.
+ *   Note: aggregate statistics (min, max, mean, IQR) are always returned
+ *   unchanged.</li>
+ *   <li>Correlation suppression – covariance, Pearson/Spearman correlation,
+ *   and chi-squared matrices are cleared when the global minimum-subset rule
+ *   fires.</li>
+ * </ol>
+ *

All suppression events are logged at WARN level so that the audit trail in + * application logs captures every disclosure decision. + */ +@Service +public class DisclosureControlService { + + private static final Logger logger = LoggerFactory.getLogger(DisclosureControlService.class); + + private final int minSubsetSize; + private final int minCellCount; + + public DisclosureControlService( + @Value("${disclosure.min.subset.size:3}") int minSubsetSize, + @Value("${disclosure.min.cell.count:2}") int minCellCount) { + this.minSubsetSize = minSubsetSize; + this.minCellCount = minCellCount; + } + + public int getMinSubsetSize() { + return minSubsetSize; + } + + public int getMinCellCount() { + return minCellCount; + } + + /** + * Applies all disclosure-control rules to {@code response} in place. + * + * @param response the analytics response to sanitise + * @param totalRecords total number of records that were processed + * @return the number of features that were fully suppressed + */ + public int apply(AnalyticsResponseDTO response, long totalRecords) { + int suppressed = 0; + + if (totalRecords < minSubsetSize) { + suppressed += suppressAll(response, totalRecords); + return suppressed; + } + + suppressed += suppressSmallContinuousFeatures(response); + suppressed += suppressSmallCategoricalCells(response); + suppressed += suppressSmallDateFeatures(response); + suppressSmallContinuousOutlierGroups(response); + suppressSmallDateOutlierGroups(response); + + return suppressed; + } + + // ------------------------------------------------------------------------- + // Global suppression + // ------------------------------------------------------------------------- + + private int suppressAll(AnalyticsResponseDTO response, long totalRecords) { + String reason = String.format( + "Subset too small – minimum %d records required, only %d present", minSubsetSize, totalRecords); + + List omitted = new ArrayList<>( + safeList(response.getOmittedFeatures())); + int count = 0; + + 
count += moveFeaturesToOmitted(response.getContinuousFeatures(), omitted, reason); + count += moveFeaturesToOmitted(response.getCategoricalFeatures(), omitted, reason); + count += moveDateFeaturesToOmitted(response.getDateFeatures(), omitted, reason); + + response.setContinuousFeatures(List.of()); + response.setCategoricalFeatures(List.of()); + response.setDateFeatures(List.of()); + response.setOmittedFeatures(omitted); + + clearCorrelations(response); + + logger.warn("Full disclosure suppression applied: {} feature(s) suppressed (record count {} < minimum {})", + count, totalRecords, minSubsetSize); + return count; + } + + private int moveFeaturesToOmitted(List features, + List omitted, + String reason) { + if (features == null) return 0; + for (FeatureStatistics fs : features) { + omitted.add(new OmittedFeatureStatistics( + fs.getFeatureName(), fs.getCount(), + fs.getPercentMissing(), fs.getMissingValuesCount(), reason)); + } + return features.size(); + } + + private int moveDateFeaturesToOmitted(List features, + List omitted, + String reason) { + if (features == null) return 0; + for (DateFeatureStatistics fs : features) { + omitted.add(new OmittedFeatureStatistics( + fs.getFeatureName(), fs.getCount(), + fs.getPercentMissing(), fs.getMissingValuesCount(), reason)); + } + return features.size(); + } + + // ------------------------------------------------------------------------- + // Per-feature minimum-subset suppression + // ------------------------------------------------------------------------- + + private int suppressSmallContinuousFeatures(AnalyticsResponseDTO response) { + if (response.getContinuousFeatures() == null) return 0; + + String reason = String.format( + "Feature record count below minimum subset size (%d)", minSubsetSize); + List kept = new ArrayList<>(); + List omitted = new ArrayList<>(safeList(response.getOmittedFeatures())); + Set suppressedNames = new HashSet<>(); + int count = 0; + + for (FeatureStatistics fs : 
response.getContinuousFeatures()) { + if (fs.getCount() < minSubsetSize) { + omitted.add(new OmittedFeatureStatistics( + fs.getFeatureName(), fs.getCount(), + fs.getPercentMissing(), fs.getMissingValuesCount(), reason)); + suppressedNames.add(fs.getFeatureName()); + count++; + logger.warn("Suppressing continuous feature '{}': count {} below minimum {}", + fs.getFeatureName(), fs.getCount(), minSubsetSize); + } else { + kept.add(fs); + } + } + + if (!suppressedNames.isEmpty()) { + removeFromCorrelationMatrix(response.getCovariances(), suppressedNames); + removeFromCorrelationMatrix(response.getPearsonCorrelations(), suppressedNames); + removeFromCorrelationMatrix(response.getSpearmanCorrelations(), suppressedNames); + } + + response.setContinuousFeatures(kept); + response.setOmittedFeatures(omitted); + return count; + } + + private int suppressSmallDateFeatures(AnalyticsResponseDTO response) { + if (response.getDateFeatures() == null) return 0; + + String reason = String.format( + "Feature record count below minimum subset size (%d)", minSubsetSize); + List kept = new ArrayList<>(); + List omitted = new ArrayList<>(safeList(response.getOmittedFeatures())); + int count = 0; + + for (DateFeatureStatistics fs : response.getDateFeatures()) { + if (fs.getCount() < minSubsetSize) { + omitted.add(new OmittedFeatureStatistics( + fs.getFeatureName(), fs.getCount(), + fs.getPercentMissing(), fs.getMissingValuesCount(), reason)); + count++; + logger.warn("Suppressing date feature '{}': count {} below minimum {}", + fs.getFeatureName(), fs.getCount(), minSubsetSize); + } else { + suppressSmallDateHistogramCells(fs); + kept.add(fs); + } + } + + response.setDateFeatures(kept); + response.setOmittedFeatures(omitted); + return count; + } + + // ------------------------------------------------------------------------- + // Categorical cell suppression + // ------------------------------------------------------------------------- + + private int 
suppressSmallCategoricalCells(AnalyticsResponseDTO response) { + if (response.getCategoricalFeatures() == null) return 0; + + String suppressedReason = String.format( + "All categories below minimum cell count (%d)", minCellCount); + List kept = new ArrayList<>(); + List omitted = new ArrayList<>(safeList(response.getOmittedFeatures())); + Set suppressedNames = new HashSet<>(); + int suppressedFeatures = 0; + + for (FeatureStatistics fs : response.getCategoricalFeatures()) { + if (!(fs instanceof CategoricalFeatureStatistics cfs)) { + kept.add(fs); + continue; + } + + Map counts = cfs.getCategoryCounts(); + int before = counts.size(); + counts.entrySet().removeIf(e -> e.getValue() < minCellCount); + int removed = before - counts.size(); + + if (removed > 0) { + logger.warn("Suppressed {} small cell(s) in categorical feature '{}' (threshold {})", + removed, cfs.getFeatureName(), minCellCount); + } + + if (counts.isEmpty()) { + omitted.add(new OmittedFeatureStatistics( + cfs.getFeatureName(), cfs.getCount(), + cfs.getPercentMissing(), cfs.getMissingValuesCount(), suppressedReason)); + suppressedNames.add(cfs.getFeatureName()); + suppressedFeatures++; + logger.warn("Suppressing categorical feature '{}': all cells below minimum count {}", + cfs.getFeatureName(), minCellCount); + } else if (removed > 0) { + kept.add(rebuildCategoricalFeature(cfs, counts)); + } else { + kept.add(cfs); + } + } + + if (!suppressedNames.isEmpty() && response.getChiSquareTest() != null) { + List filteredChi = response.getChiSquareTest().stream() + .filter(r -> !suppressedNames.contains(r.getCategory1()) + && !suppressedNames.contains(r.getCategory2())) + .collect(Collectors.toList()); + response.setChiSquareTest(filteredChi); + } + + response.setCategoricalFeatures(kept); + response.setOmittedFeatures(omitted); + return suppressedFeatures; + } + + /** + * Rebuilds a {@link CategoricalFeatureStatistics} after cells have been + * removed from its frequency map so that derived fields (mode, 
cardinality) + * remain consistent. + */ + private CategoricalFeatureStatistics rebuildCategoricalFeature(CategoricalFeatureStatistics original, + Map filteredCounts) { + List> sorted = filteredCounts.entrySet().stream() + .sorted((a, b) -> b.getValue().compareTo(a.getValue())) + .collect(Collectors.toList()); + + Map.Entry modeEntry = sorted.get(0); + String mode = modeEntry.getKey(); + int modeFreq = modeEntry.getValue(); + double modePercent = (double) modeFreq / original.getCount() * 100; + + String secondMode = sorted.size() > 1 ? sorted.get(1).getKey() : null; + Integer secondModeFreq = secondMode != null ? filteredCounts.get(secondMode) : null; + Double secondModePercent = secondModeFreq != null + ? (double) secondModeFreq / original.getCount() * 100 : null; + + return new CategoricalFeatureStatistics( + original.getFeatureName(), original.getCount(), + original.getPercentMissing(), original.getMissingValuesCount(), + filteredCounts.size(), + mode, modeFreq, modePercent, + secondMode, secondModeFreq, secondModePercent, + filteredCounts); + } + + // ------------------------------------------------------------------------- + // Date histogram cell suppression + // ------------------------------------------------------------------------- + + private void suppressSmallDateHistogramCells(DateFeatureStatistics fs) { + Map histogram = fs.getDateHistogram(); + if (histogram == null) return; + int before = histogram.size(); + histogram.entrySet().removeIf(e -> e.getValue() < minCellCount); + int removed = before - histogram.size(); + if (removed > 0) { + logger.warn("Suppressed {} small date histogram bucket(s) in feature '{}' (threshold {})", + removed, fs.getFeatureName(), minCellCount); + } + } + + // ------------------------------------------------------------------------- + // Outlier-value suppression (k-anonymity) + // ------------------------------------------------------------------------- + + /** + * Suppresses the raw numeric outlier list only when the 
number of outlier + * values is smaller than {@code minCellCount} – a tiny group (e.g. a single + * extreme measurement) can uniquely identify an individual. Larger groups + * are left intact because they no longer pinpoint specific records, and the + * analytics UI depends on the values for visualisation. + * + *

Note: aggregate statistics (min, max, mean, IQR) are not modified. + */ + private void suppressSmallContinuousOutlierGroups(AnalyticsResponseDTO response) { + if (response.getContinuousFeatures() == null) return; + + List sanitised = new ArrayList<>(); + for (FeatureStatistics fs : response.getContinuousFeatures()) { + if (fs instanceof ContinuousFeatureStatistics cfs + && !cfs.getOutliers().isEmpty() + && cfs.getOutliers().size() < minCellCount) { + logger.warn( + "Suppressing {} outlier value(s) for continuous feature '{}': group size < {}", + cfs.getOutliers().size(), cfs.getFeatureName(), minCellCount); + sanitised.add(new ContinuousFeatureStatistics( + cfs.getFeatureName(), cfs.getCount(), + cfs.getPercentMissing(), cfs.getMissingValuesCount(), + cfs.getCardinality(), + cfs.getMin(), cfs.getMax(), cfs.getMean(), cfs.getStdDev(), + cfs.getQrt1(), cfs.getMedian(), cfs.getQrt3(), + cfs.getHistogram(), cfs.getBinRanges(), + List.of())); + } else { + sanitised.add(fs); + } + } + response.setContinuousFeatures(sanitised); + } + + /** + * Suppresses the raw date outlier list only when the number of outlier + * dates is smaller than {@code minCellCount}. 
+ */ + private void suppressSmallDateOutlierGroups(AnalyticsResponseDTO response) { + if (response.getDateFeatures() == null) return; + + List sanitised = new ArrayList<>(); + for (DateFeatureStatistics fs : response.getDateFeatures()) { + if (!fs.getOutliers().isEmpty() && fs.getOutliers().size() < minCellCount) { + logger.warn( + "Suppressing {} outlier date(s) for date feature '{}': group size < {}", + fs.getOutliers().size(), fs.getFeatureName(), minCellCount); + sanitised.add(new DateFeatureStatistics( + fs.getFeatureName(), fs.getCount(), + fs.getPercentMissing(), fs.getMissingValuesCount(), + fs.getEarliestDate(), fs.getLatestDate(), + fs.getDateHistogram(), + List.of(), + fs.getMean(), fs.getStdDev(), + fs.getMedian(), fs.getQ1(), fs.getQ3())); + } else { + sanitised.add(fs); + } + } + response.setDateFeatures(sanitised); + } + + // ------------------------------------------------------------------------- + // Correlation suppression + // ------------------------------------------------------------------------- + + private void clearCorrelations(AnalyticsResponseDTO response) { + response.setCovariances(Map.of()); + response.setPearsonCorrelations(Map.of()); + response.setSpearmanCorrelations(Map.of()); + response.setChiSquareTest(List.of()); + } + + // ------------------------------------------------------------------------- + // Helpers + // ------------------------------------------------------------------------- + + /** + * Removes every entry whose key is in {@code featureNames} from the outer map, + * and also purges those names from every inner map (since the pairwise relation + * is stored symmetrically under both feature keys). + */ + private void removeFromCorrelationMatrix(Map> matrix, + Set featureNames) { + if (matrix == null) return; + featureNames.forEach(matrix::remove); + matrix.values().forEach(inner -> inner.keySet().removeAll(featureNames)); + } + + private List safeList(List list) { + return list != null ? 
list : List.of(); + } +} diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index 55a6816..fbb935a 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -31,9 +31,17 @@ jwt.expiration=10800 kerberos.ticket.version=5 kerberos.realm=TANIWHA-REALM # Logger config -logging.config=classpath:logback.xml +# Note: logging.config is intentionally omitted so that logback-test.xml takes precedence +# during test runs (Logback's standard auto-discovery) while logback.xml is loaded in production. logging.level.org.taniwha=DEBUG tls.probe.enabled=true # tls.probe.host=semantics.inf.um.es # tls.probe.port=443 + +# Disclosure control – privacy thresholds for analytics outputs +# Minimum number of records required to return feature statistics (generous default) +disclosure.min.subset.size=${DISCLOSURE_MIN_SUBSET_SIZE:3} +# Minimum cell count for categorical/date histogram buckets (k-anonymity). +# Cells with a count of exactly 1 (a unique individual) are suppressed by default. 
+disclosure.min.cell.count=${DISCLOSURE_MIN_CELL_COUNT:2} diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml index 460020a..c33fe35 100644 --- a/src/main/resources/logback.xml +++ b/src/main/resources/logback.xml @@ -5,6 +5,24 @@ + + + + ${APP_PATH:-/taniwha}/logs/audit.log + + ${APP_PATH:-/taniwha}/logs/audit.%d{yyyy-MM-dd}.log + 30 + + + %d{yyyy-MM-dd HH:mm:ss} %msg%n + + + + + + + + diff --git a/src/test/java/org/taniwha/controller/AnalyticsControllerTest.java b/src/test/java/org/taniwha/controller/AnalyticsControllerTest.java index fde6067..fffdf28 100644 --- a/src/test/java/org/taniwha/controller/AnalyticsControllerTest.java +++ b/src/test/java/org/taniwha/controller/AnalyticsControllerTest.java @@ -8,6 +8,7 @@ import org.springframework.test.web.servlet.setup.MockMvcBuilders; import org.taniwha.dto.*; import org.taniwha.service.jobs.AnalyticsProcessingJobs; +import org.taniwha.service.AnalyticsAuditService; import org.taniwha.service.AnalyticsService; import java.util.Collections; @@ -21,20 +22,24 @@ import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.post; import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.*; +import org.mockito.Mockito; + class AnalyticsControllerTest { private MockMvc mvc; private AnalyticsService analyticsService; private AnalyticsProcessingJobs jobs; + private AnalyticsAuditService auditService; private final ObjectMapper om = new ObjectMapper(); @BeforeEach void setUp() { analyticsService = mock(AnalyticsService.class); jobs = mock(AnalyticsProcessingJobs.class); + auditService = mock(AnalyticsAuditService.class); mvc = MockMvcBuilders - .standaloneSetup(new AnalyticsController(analyticsService, jobs)) + .standaloneSetup(new AnalyticsController(analyticsService, jobs, auditService)) .build(); } @@ -181,4 +186,138 @@ void filterByNameList_serviceThrows_returns500AndErrorMessage() throws Exception .andExpect(status().isInternalServerError()) 
.andExpect(jsonPath("$[0].message").value("Error filtering multiple files: boom")); } + + // ------------------------------------------------------------------------- + // processList – huge-job path (returns 202 + jobId) + // ------------------------------------------------------------------------- + + @Test + void processList_hugeFiles_returns202WithJobId() throws Exception { + FileNamesDTO reqDto = new FileNamesDTO(); + reqDto.setFileNames(List.of("huge.csv")); + + when(analyticsService.isAnyHugeForDiscovery(reqDto.getFileNames())).thenReturn(true); + when(jobs.createJob()).thenReturn("job-abc"); + + mvc.perform(post("/api/data/processList") + .contentType(MediaType.APPLICATION_JSON) + .content(om.writeValueAsString(reqDto))) + .andExpect(status().isAccepted()) + .andExpect(jsonPath("$.jobId").value("job-abc")) + .andExpect(jsonPath("$.progress").value(true)); + + verify(analyticsService).startDiscoveryJob("job-abc", List.of("huge.csv")); + } + + // ------------------------------------------------------------------------- + // processListStatus – found job + // ------------------------------------------------------------------------- + + @Test + void processListStatus_knownJob_returns200WithDto() throws Exception { + AnalyticsProcessingJobs.JobState s = mock(AnalyticsProcessingJobs.JobState.class); + ProcessingStatusDTO dto = new ProcessingStatusDTO("job-1", ProcessingStatusDTO.State.RUNNING, 50, "f.csv", null, null); + when(jobs.getJob("job-1")).thenReturn(s); + when(jobs.toDto(s, false)).thenReturn(dto); + + mvc.perform(get("/api/data/processList/status/job-1")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.jobId").value("job-1")) + .andExpect(jsonPath("$.state").value("RUNNING")) + .andExpect(jsonPath("$.percent").value(50)); + } + + // ------------------------------------------------------------------------- + // cancelProcessList + // ------------------------------------------------------------------------- + + @Test + void 
cancelProcessList_knownJob_returns200() throws Exception { + AnalyticsProcessingJobs.JobState s = mock(AnalyticsProcessingJobs.JobState.class); + ProcessingStatusDTO dto = new ProcessingStatusDTO("job-2", ProcessingStatusDTO.State.CANCELED, 30, null, "Canceled", null); + when(jobs.getJob("job-2")).thenReturn(s); + when(jobs.toDto(s, false)).thenReturn(dto); + + mvc.perform(post("/api/data/processList/cancel/job-2")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.state").value("CANCELED")); + + verify(jobs).cancel("job-2", "Discovery canceled because the user left the page"); + } + + @Test + void cancelProcessList_unknownJob_returns404() throws Exception { + when(jobs.getJob("unknown")).thenReturn(null); + + mvc.perform(post("/api/data/processList/cancel/unknown")) + .andExpect(status().isNotFound()); + + verify(jobs, never()).cancel(any(), any()); + } + + // ------------------------------------------------------------------------- + // processListResult + // ------------------------------------------------------------------------- + + @Test + void processListResult_unknownJob_returns404() throws Exception { + when(jobs.getJob("j")).thenReturn(null); + + mvc.perform(get("/api/data/processList/result/j")) + .andExpect(status().isNotFound()); + } + + @Test + void processListResult_canceledJob_returns410AndClearsJob() throws Exception { + AnalyticsProcessingJobs.JobState s = mock(AnalyticsProcessingJobs.JobState.class); + when(s.getState()).thenReturn(ProcessingStatusDTO.State.CANCELED); + when(jobs.getJob("j-canceled")).thenReturn(s); + + mvc.perform(get("/api/data/processList/result/j-canceled")) + .andExpect(status().isGone()); + + verify(jobs).clear("j-canceled"); + } + + @Test + void processListResult_jobStillRunning_returns409() throws Exception { + AnalyticsProcessingJobs.JobState s = mock(AnalyticsProcessingJobs.JobState.class); + when(s.getState()).thenReturn(ProcessingStatusDTO.State.RUNNING); + when(jobs.getJob("j-running")).thenReturn(s); + + 
mvc.perform(get("/api/data/processList/result/j-running")) + .andExpect(status().isConflict()); + + verify(jobs, never()).clear(any()); + } + + @Test + void processListResult_doneJobWithResults_returns200AndClearsJob() throws Exception { + AnalyticsProcessingJobs.JobState s = mock(AnalyticsProcessingJobs.JobState.class); + when(s.getState()).thenReturn(ProcessingStatusDTO.State.DONE); + List results = List.of(new AnalyticsResponseDTO("r1")); + when(s.getResults()).thenReturn(results); + when(jobs.getJob("j-done")).thenReturn(s); + + mvc.perform(get("/api/data/processList/result/j-done")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$[0].message").value("r1")); + + verify(jobs).clear("j-done"); + } + + @Test + void processListResult_doneJobNullResults_returnsEmptyList() throws Exception { + AnalyticsProcessingJobs.JobState s = mock(AnalyticsProcessingJobs.JobState.class); + when(s.getState()).thenReturn(ProcessingStatusDTO.State.DONE); + when(s.getResults()).thenReturn(null); + when(jobs.getJob("j-null")).thenReturn(s); + + mvc.perform(get("/api/data/processList/result/j-null")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$").isArray()) + .andExpect(jsonPath("$").isEmpty()); + + verify(jobs).clear("j-null"); + } } diff --git a/src/test/java/org/taniwha/controller/FileCleaningControllerTest.java b/src/test/java/org/taniwha/controller/FileCleaningControllerTest.java new file mode 100644 index 0000000..b30b76e --- /dev/null +++ b/src/test/java/org/taniwha/controller/FileCleaningControllerTest.java @@ -0,0 +1,228 @@ +package org.taniwha.controller; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.http.MediaType; +import org.springframework.test.web.servlet.MockMvc; +import org.springframework.test.web.servlet.setup.MockMvcBuilders; +import org.taniwha.dto.DataCleaningOptionsDTO; +import org.taniwha.model.FileCategory; +import 
org.taniwha.service.DataCleaningService; +import org.taniwha.service.jobs.CleaningProcessingJobs; + +import static org.hamcrest.Matchers.*; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.*; +import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.*; +import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.*; + +class FileCleaningControllerTest { + + private MockMvc mvc; + private DataCleaningService dataCleaningService; + private CleaningProcessingJobs cleaningJobs; + private final ObjectMapper om = new ObjectMapper(); + + @BeforeEach + void setUp() { + dataCleaningService = mock(DataCleaningService.class); + cleaningJobs = mock(CleaningProcessingJobs.class); + mvc = MockMvcBuilders + .standaloneSetup(new FileCleaningController(dataCleaningService, cleaningJobs)) + .build(); + } + + // ------------------------------------------------------------------------- + // POST /api/files/clean + // ------------------------------------------------------------------------- + + @Test + void cleanFile_invokesServiceAndReturns200() throws Exception { + mvc.perform(post("/api/files/clean") + .param("category", "DATASETS") + .param("name", "data.csv")) + .andExpect(status().isOk()); + + verify(dataCleaningService).cleanInPlace(eq(FileCategory.DATASETS), eq("data.csv"), isNull()); + } + + @Test + void cleanFile_withOptions_passesOptionsToService() throws Exception { + DataCleaningOptionsDTO opts = new DataCleaningOptionsDTO(); + mvc.perform(post("/api/files/clean") + .param("category", "DATASETS") + .param("name", "data.csv") + .contentType(MediaType.APPLICATION_JSON) + .content(om.writeValueAsString(opts))) + .andExpect(status().isOk()); + + verify(dataCleaningService).cleanInPlace(eq(FileCategory.DATASETS), eq("data.csv"), any(DataCleaningOptionsDTO.class)); + } + + // ------------------------------------------------------------------------- + // 
POST /api/files/clean/start + // ------------------------------------------------------------------------- + + @Test + void startCleanFile_returns202WithJobId() throws Exception { + when(cleaningJobs.createJob()).thenReturn("job-1"); + + mvc.perform(post("/api/files/clean/start") + .param("category", "DATASETS") + .param("name", "data.csv")) + .andExpect(status().isAccepted()) + .andExpect(jsonPath("$.jobId").value("job-1")) + .andExpect(jsonPath("$.accepted").value(true)); + + verify(dataCleaningService).startCleanJob(eq("job-1"), eq(FileCategory.DATASETS), eq("data.csv"), isNull()); + } + + @Test + void startCleanFile_withOptions_passesOptionsToService() throws Exception { + when(cleaningJobs.createJob()).thenReturn("job-2"); + DataCleaningOptionsDTO opts = new DataCleaningOptionsDTO(); + + mvc.perform(post("/api/files/clean/start") + .param("category", "FHIR_MAPPINGS") + .param("name", "map.json") + .contentType(MediaType.APPLICATION_JSON) + .content(om.writeValueAsString(opts))) + .andExpect(status().isAccepted()) + .andExpect(jsonPath("$.jobId").value("job-2")); + + verify(dataCleaningService).startCleanJob(eq("job-2"), eq(FileCategory.FHIR_MAPPINGS), eq("map.json"), any(DataCleaningOptionsDTO.class)); + } + + // ------------------------------------------------------------------------- + // GET /api/files/clean/status/{jobId} + // ------------------------------------------------------------------------- + + @Test + void getCleanStatus_unknownJob_returns404() throws Exception { + when(cleaningJobs.getJob("missing")).thenReturn(null); + + mvc.perform(get("/api/files/clean/status/missing")) + .andExpect(status().isNotFound()); + } + + @Test + void getCleanStatus_knownJob_returns200WithFields() throws Exception { + CleaningProcessingJobs.JobState state = makeJobState("job-3", CleaningProcessingJobs.State.RUNNING, 42, "current.csv", "Working"); + when(cleaningJobs.getJob("job-3")).thenReturn(state); + + mvc.perform(get("/api/files/clean/status/job-3")) + 
.andExpect(status().isOk()) + .andExpect(jsonPath("$.jobId").value("job-3")) + .andExpect(jsonPath("$.state").value("RUNNING")) + .andExpect(jsonPath("$.percent").value(42)) + .andExpect(jsonPath("$.currentFile").value("current.csv")) + .andExpect(jsonPath("$.message").value("Working")); + } + + @Test + void getCleanStatus_nullCurrentFileAndMessage_returnsEmptyStrings() throws Exception { + CleaningProcessingJobs.JobState state = makeJobState("job-4", CleaningProcessingJobs.State.DONE, 100, null, null); + when(cleaningJobs.getJob("job-4")).thenReturn(state); + + mvc.perform(get("/api/files/clean/status/job-4")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.currentFile").value("")) + .andExpect(jsonPath("$.message").value("")); + } + + // ------------------------------------------------------------------------- + // GET /api/files/clean/result/{jobId} + // ------------------------------------------------------------------------- + + @Test + void getCleanResult_unknownJob_returns404() throws Exception { + when(cleaningJobs.getJob("gone")).thenReturn(null); + + mvc.perform(get("/api/files/clean/result/gone")) + .andExpect(status().isNotFound()); + } + + @Test + void getCleanResult_jobInErrorState_returns500AndClearsJob() throws Exception { + CleaningProcessingJobs.JobState state = makeJobState("job-5", CleaningProcessingJobs.State.ERROR, 0, null, "Something went wrong"); + when(cleaningJobs.getJob("job-5")).thenReturn(state); + + mvc.perform(get("/api/files/clean/result/job-5")) + .andExpect(status().isInternalServerError()) + .andExpect(jsonPath("$.message").value("Something went wrong")); + + verify(cleaningJobs).clear("job-5"); + } + + @Test + void getCleanResult_errorStateNullMessage_usesDefaultMessage() throws Exception { + CleaningProcessingJobs.JobState state = makeJobState("job-6", CleaningProcessingJobs.State.ERROR, 0, null, null); + when(cleaningJobs.getJob("job-6")).thenReturn(state); + + mvc.perform(get("/api/files/clean/result/job-6")) + 
.andExpect(status().isInternalServerError()) + .andExpect(jsonPath("$.message").value("Cleaning failed.")); + + verify(cleaningJobs).clear("job-6"); + } + + @Test + void getCleanResult_jobStillRunning_returns409() throws Exception { + CleaningProcessingJobs.JobState state = makeJobState("job-7", CleaningProcessingJobs.State.RUNNING, 50, null, null); + when(cleaningJobs.getJob("job-7")).thenReturn(state); + + mvc.perform(get("/api/files/clean/result/job-7")) + .andExpect(status().isConflict()) + .andExpect(jsonPath("$.message").value("Cleaning job is not finished yet.")); + + verify(cleaningJobs, never()).clear(any()); + } + + @Test + void getCleanResult_jobDoneWithResult_returns200AndClearsJob() throws Exception { + CleaningProcessingJobs.JobState state = makeJobState("job-8", CleaningProcessingJobs.State.DONE, 100, null, null); + when(state.getResult()).thenReturn("All clean!"); + when(cleaningJobs.getJob("job-8")).thenReturn(state); + + mvc.perform(get("/api/files/clean/result/job-8")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.message").value("All clean!")); + + verify(cleaningJobs).clear("job-8"); + } + + @Test + void getCleanResult_jobDoneNullResult_usesDefaultMessage() throws Exception { + CleaningProcessingJobs.JobState state = makeJobState("job-9", CleaningProcessingJobs.State.DONE, 100, null, null); + // result is null + when(cleaningJobs.getJob("job-9")).thenReturn(state); + + mvc.perform(get("/api/files/clean/result/job-9")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.message").value("Cleaning completed successfully.")); + + verify(cleaningJobs).clear("job-9"); + } + + // ------------------------------------------------------------------------- + // Helper + // ------------------------------------------------------------------------- + + private CleaningProcessingJobs.JobState makeJobState(String jobId, + CleaningProcessingJobs.State state, + int percent, + String currentFile, + String message) { + CleaningProcessingJobs.JobState 
js = mock(CleaningProcessingJobs.JobState.class); + java.util.concurrent.atomic.AtomicInteger pct = new java.util.concurrent.atomic.AtomicInteger(percent); + when(js.getJobId()).thenReturn(jobId); + when(js.getState()).thenReturn(state); + when(js.getPercent()).thenReturn(pct); + when(js.getCurrentFile()).thenReturn(currentFile); + when(js.getMessage()).thenReturn(message); + when(js.getResult()).thenReturn(null); + return js; + } +} diff --git a/src/test/java/org/taniwha/controller/FileControllerTest.java b/src/test/java/org/taniwha/controller/FileControllerTest.java index 830741b..927c6b9 100644 --- a/src/test/java/org/taniwha/controller/FileControllerTest.java +++ b/src/test/java/org/taniwha/controller/FileControllerTest.java @@ -5,6 +5,8 @@ import org.springframework.http.MediaType; import org.springframework.test.web.servlet.MockMvc; import org.springframework.test.web.servlet.setup.MockMvcBuilders; +import org.taniwha.dto.FileInfoDto; +import org.taniwha.model.FileCategory; import org.taniwha.service.FileService; import java.nio.file.Files; @@ -175,4 +177,62 @@ void getElementFile_badPath_throwsReading_returns500() throws Exception { .andExpect(content() .string("Error fetching element file: bad.txt")); } + + // ------------------------------------------------------------------------- + // GET /api/files?category=DATASETS (listFiles) + // ------------------------------------------------------------------------- + + @Test + void listFiles_returnsInfoDtoList() throws Exception { + List infos = List.of( + new FileInfoDto("a.csv", 100L, 1000L, 2000L), + new FileInfoDto("b.csv", 200L, 3000L, 4000L) + ); + when(fileService.listFilesWithInfo(FileCategory.DATASETS)).thenReturn(infos); + + mvc.perform(get("/api/files").param("category", "DATASETS")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$[0].name").value("a.csv")) + .andExpect(jsonPath("$[0].sizeBytes").value(100)) + .andExpect(jsonPath("$[1].name").value("b.csv")); + } + + @Test + void 
listFiles_emptyResult_returns200WithEmptyArray() throws Exception { + when(fileService.listFilesWithInfo(FileCategory.FHIR_MAPPINGS)).thenReturn(List.of()); + + mvc.perform(get("/api/files").param("category", "FHIR_MAPPINGS")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$").isArray()) + .andExpect(jsonPath("$").isEmpty()); + } + + // ------------------------------------------------------------------------- + // POST /api/files/rename + // ------------------------------------------------------------------------- + + @Test + void renameFile_success_returns200() throws Exception { + mvc.perform(post("/api/files/rename") + .param("category", "DATASETS") + .param("from", "old.csv") + .param("to", "new.csv")) + .andExpect(status().isOk()); + + verify(fileService).renameFile(FileCategory.DATASETS, "old.csv", "new.csv"); + } + + // ------------------------------------------------------------------------- + // DELETE /api/files?category=DATASETS&name=x.csv + // ------------------------------------------------------------------------- + + @Test + void deleteFile_success_returns200() throws Exception { + mvc.perform(delete("/api/files") + .param("category", "DATASETS") + .param("name", "del.csv")) + .andExpect(status().isOk()); + + verify(fileService).deleteFile(FileCategory.DATASETS, "del.csv"); + } } diff --git a/src/test/java/org/taniwha/dto/CleaningStartDTOTest.java b/src/test/java/org/taniwha/dto/CleaningStartDTOTest.java new file mode 100644 index 0000000..8a69b57 --- /dev/null +++ b/src/test/java/org/taniwha/dto/CleaningStartDTOTest.java @@ -0,0 +1,24 @@ +package org.taniwha.dto; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +class CleaningStartDTOTest { + + @Test + void constructor_setsAllFields() { + CleaningStartDTO dto = new CleaningStartDTO("job-abc", true); + + assertThat(dto.getJobId()).isEqualTo("job-abc"); + assertThat(dto.isAccepted()).isTrue(); + } + + @Test + void 
constructor_withFalseAccepted() { + CleaningStartDTO dto = new CleaningStartDTO("job-xyz", false); + + assertThat(dto.getJobId()).isEqualTo("job-xyz"); + assertThat(dto.isAccepted()).isFalse(); + } +} diff --git a/src/test/java/org/taniwha/dto/CleaningStatusDTOTest.java b/src/test/java/org/taniwha/dto/CleaningStatusDTOTest.java new file mode 100644 index 0000000..ac61c32 --- /dev/null +++ b/src/test/java/org/taniwha/dto/CleaningStatusDTOTest.java @@ -0,0 +1,50 @@ +package org.taniwha.dto; + +import org.junit.jupiter.api.Test; +import org.taniwha.service.jobs.CleaningProcessingJobs; + +import static org.assertj.core.api.Assertions.assertThat; + +class CleaningStatusDTOTest { + + @Test + void constructor_setsAllFields() { + CleaningStatusDTO dto = new CleaningStatusDTO( + "job-1", CleaningProcessingJobs.State.RUNNING, 42, "file.csv", "Working…", "ok"); + + assertThat(dto.getJobId()).isEqualTo("job-1"); + assertThat(dto.getState()).isEqualTo(CleaningProcessingJobs.State.RUNNING); + assertThat(dto.getPercent()).isEqualTo(42); + assertThat(dto.getCurrentFile()).isEqualTo("file.csv"); + assertThat(dto.getMessage()).isEqualTo("Working…"); + assertThat(dto.getResult()).isEqualTo("ok"); + } + + @Test + void setters_changeFieldValues() { + CleaningStatusDTO dto = new CleaningStatusDTO( + "j0", CleaningProcessingJobs.State.RUNNING, 0, "", "", ""); + + dto.setJobId("job-2"); + dto.setState(CleaningProcessingJobs.State.DONE); + dto.setPercent(100); + dto.setCurrentFile("out.csv"); + dto.setMessage("Done"); + dto.setResult("Cleaning completed successfully."); + + assertThat(dto.getJobId()).isEqualTo("job-2"); + assertThat(dto.getState()).isEqualTo(CleaningProcessingJobs.State.DONE); + assertThat(dto.getPercent()).isEqualTo(100); + assertThat(dto.getCurrentFile()).isEqualTo("out.csv"); + assertThat(dto.getMessage()).isEqualTo("Done"); + assertThat(dto.getResult()).isEqualTo("Cleaning completed successfully."); + } + + @Test + void stateValues_areAllAccessible() { + 
assertThat(CleaningProcessingJobs.State.values()).containsExactlyInAnyOrder( + CleaningProcessingJobs.State.RUNNING, + CleaningProcessingJobs.State.DONE, + CleaningProcessingJobs.State.ERROR); + } +} diff --git a/src/test/java/org/taniwha/dto/HarmonizationStartDTOTest.java b/src/test/java/org/taniwha/dto/HarmonizationStartDTOTest.java new file mode 100644 index 0000000..2111432 --- /dev/null +++ b/src/test/java/org/taniwha/dto/HarmonizationStartDTOTest.java @@ -0,0 +1,31 @@ +package org.taniwha.dto; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +class HarmonizationStartDTOTest { + + @Test + void noArgsConstructor_createsDefaultDto() { + HarmonizationStartDTO dto = new HarmonizationStartDTO(); + assertThat(dto.getJobId()).isNull(); + assertThat(dto.isProgress()).isFalse(); + } + + @Test + void allArgsConstructor_setsAllFields() { + HarmonizationStartDTO dto = new HarmonizationStartDTO("job-42", true); + assertThat(dto.getJobId()).isEqualTo("job-42"); + assertThat(dto.isProgress()).isTrue(); + } + + @Test + void setters_changeFields() { + HarmonizationStartDTO dto = new HarmonizationStartDTO(); + dto.setJobId("j-99"); + dto.setProgress(true); + assertThat(dto.getJobId()).isEqualTo("j-99"); + assertThat(dto.isProgress()).isTrue(); + } +} diff --git a/src/test/java/org/taniwha/dto/HarmonizationStatusDTOTest.java b/src/test/java/org/taniwha/dto/HarmonizationStatusDTOTest.java new file mode 100644 index 0000000..4cb13fb --- /dev/null +++ b/src/test/java/org/taniwha/dto/HarmonizationStatusDTOTest.java @@ -0,0 +1,66 @@ +package org.taniwha.dto; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +class HarmonizationStatusDTOTest { + + @Test + void noArgsConstructor_createsEmptyDto() { + HarmonizationStatusDTO dto = new HarmonizationStatusDTO(); + assertThat(dto.getJobId()).isNull(); + assertThat(dto.getState()).isNull(); + assertThat(dto.getPercent()).isEqualTo(0); + 
assertThat(dto.getCurrentDataset()).isNull(); + assertThat(dto.getMessage()).isNull(); + assertThat(dto.getResult()).isNull(); + } + + @Test + void allArgsConstructor_setsAllFields() { + HarmonizationStatusDTO dto = new HarmonizationStatusDTO( + "j1", HarmonizationStatusDTO.State.RUNNING, 55, "data.csv", "Processing", "result"); + + assertThat(dto.getJobId()).isEqualTo("j1"); + assertThat(dto.getState()).isEqualTo(HarmonizationStatusDTO.State.RUNNING); + assertThat(dto.getPercent()).isEqualTo(55); + assertThat(dto.getCurrentDataset()).isEqualTo("data.csv"); + assertThat(dto.getMessage()).isEqualTo("Processing"); + assertThat(dto.getResult()).isEqualTo("result"); + } + + @Test + void setters_changeFieldValues() { + HarmonizationStatusDTO dto = new HarmonizationStatusDTO(); + + dto.setJobId("j2"); + dto.setState(HarmonizationStatusDTO.State.DONE); + dto.setPercent(100); + dto.setCurrentDataset("parsed.csv"); + dto.setMessage("Finished"); + dto.setResult("Files processed successfully."); + + assertThat(dto.getJobId()).isEqualTo("j2"); + assertThat(dto.getState()).isEqualTo(HarmonizationStatusDTO.State.DONE); + assertThat(dto.getPercent()).isEqualTo(100); + assertThat(dto.getCurrentDataset()).isEqualTo("parsed.csv"); + assertThat(dto.getMessage()).isEqualTo("Finished"); + assertThat(dto.getResult()).isEqualTo("Files processed successfully."); + } + + @Test + void stateEnum_containsAllValues() { + assertThat(HarmonizationStatusDTO.State.values()).containsExactlyInAnyOrder( + HarmonizationStatusDTO.State.RUNNING, + HarmonizationStatusDTO.State.DONE, + HarmonizationStatusDTO.State.ERROR); + } + + @Test + void setState_errorState() { + HarmonizationStatusDTO dto = new HarmonizationStatusDTO(); + dto.setState(HarmonizationStatusDTO.State.ERROR); + assertThat(dto.getState()).isEqualTo(HarmonizationStatusDTO.State.ERROR); + } +} diff --git a/src/test/java/org/taniwha/dto/ProcessingStartDTOTest.java b/src/test/java/org/taniwha/dto/ProcessingStartDTOTest.java new file mode 
100644 index 0000000..73277a2 --- /dev/null +++ b/src/test/java/org/taniwha/dto/ProcessingStartDTOTest.java @@ -0,0 +1,31 @@ +package org.taniwha.dto; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +class ProcessingStartDTOTest { + + @Test + void noArgsConstructor_createsDefaultDto() { + ProcessingStartDTO dto = new ProcessingStartDTO(); + assertThat(dto.getJobId()).isNull(); + assertThat(dto.isProgress()).isFalse(); + } + + @Test + void allArgsConstructor_setsAllFields() { + ProcessingStartDTO dto = new ProcessingStartDTO("job-1", true); + assertThat(dto.getJobId()).isEqualTo("job-1"); + assertThat(dto.isProgress()).isTrue(); + } + + @Test + void setters_changeFields() { + ProcessingStartDTO dto = new ProcessingStartDTO(); + dto.setJobId("proc-job"); + dto.setProgress(false); + assertThat(dto.getJobId()).isEqualTo("proc-job"); + assertThat(dto.isProgress()).isFalse(); + } +} diff --git a/src/test/java/org/taniwha/security/FileFilterTest.java b/src/test/java/org/taniwha/security/FileFilterTest.java index a5e350f..411c21c 100644 --- a/src/test/java/org/taniwha/security/FileFilterTest.java +++ b/src/test/java/org/taniwha/security/FileFilterTest.java @@ -165,5 +165,149 @@ void isFileInvalid_multipartFile_withDisallowedTxtExtension_shouldReturnTrue() { assertThat(fileFilter.isFileInvalid(file)).isTrue(); } + + // ------------------------------------------------------------------------- + // isFileInvalid(Path) – disallowed content branches + // ------------------------------------------------------------------------- + + @Test + void isFileInvalid_path_csvWithDisallowedContent_returnsTrue() throws IOException { + Path csvFile = tempDir.resolve("bad.csv"); + Files.writeString(csvFile, "col1,col2\n,value", StandardCharsets.UTF_8); + + assertThat(fileFilter.isFileInvalid(csvFile)).isTrue(); + } + + @Test + void isFileInvalid_path_csvWithJavascriptScheme_returnsTrue() throws IOException { + Path csvFile = 
tempDir.resolve("js.csv"); + Files.writeString(csvFile, "col1\njavascript:void(0)", StandardCharsets.UTF_8); + + assertThat(fileFilter.isFileInvalid(csvFile)).isTrue(); + } + + @Test + void isFileInvalid_path_xlsxWithDisallowedContent_returnsTrue() throws Exception { + // Build a minimal xlsx (zip with STORED/uncompressed entries) so the raw + // bytes of the file contain the disallowed pattern and scanXlsxHead finds it. + Path xlsxFile = tempDir.resolve("bad.xlsx"); + byte[] content = "javascript:void(0) evil content here".getBytes(StandardCharsets.UTF_8); + java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(content); + try (java.util.zip.ZipOutputStream zos = new java.util.zip.ZipOutputStream(Files.newOutputStream(xlsxFile))) { + java.util.zip.ZipEntry entry = new java.util.zip.ZipEntry("xl/worksheets/sheet1.xml"); + entry.setMethod(java.util.zip.ZipEntry.STORED); + entry.setSize(content.length); + entry.setCompressedSize(content.length); + entry.setCrc(crc.getValue()); + zos.putNextEntry(entry); + zos.write(content); + zos.closeEntry(); + } + + assertThat(fileFilter.isFileInvalid(xlsxFile)).isTrue(); + } + + @Test + void isFileInvalid_path_xlsxWithCleanContent_returnsFalse() throws Exception { + // Build a minimal xlsx (zip) with clean content + Path xlsxFile = tempDir.resolve("clean.xlsx"); + byte[] content = "1" + .getBytes(StandardCharsets.UTF_8); + java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(content); + try (java.util.zip.ZipOutputStream zos = new java.util.zip.ZipOutputStream(Files.newOutputStream(xlsxFile))) { + java.util.zip.ZipEntry entry = new java.util.zip.ZipEntry("xl/worksheets/sheet1.xml"); + entry.setMethod(java.util.zip.ZipEntry.STORED); + entry.setSize(content.length); + entry.setCompressedSize(content.length); + entry.setCrc(crc.getValue()); + zos.putNextEntry(entry); + zos.write(content); + zos.closeEntry(); + } + + assertThat(fileFilter.isFileInvalid(xlsxFile)).isFalse(); + } + + // 
------------------------------------------------------------------------- + // isFileInvalid(MultipartFile) – disallowed content in CSV + // ------------------------------------------------------------------------- + + @Test + void isFileInvalid_multipartCsv_withDisallowedContent_returnsTrue() { + // Use the exact patterns from DisallowedContentPatterns + MultipartFile file = new MockMultipartFile( + "file", + "data.csv", + "text/csv", + "name\n".getBytes(StandardCharsets.UTF_8) + ); + + assertThat(fileFilter.isFileInvalid(file)).isTrue(); + } + + @Test + void isFileInvalid_multipartCsv_withVbscript_returnsTrue() { + MultipartFile file = new MockMultipartFile( + "file", + "data.csv", + "text/csv", + "col\nvbscript:run()".getBytes(StandardCharsets.UTF_8) + ); + + assertThat(fileFilter.isFileInvalid(file)).isTrue(); + } + + // ------------------------------------------------------------------------- + // getExtension edge cases (via isFileInvalid dispatching) + // ------------------------------------------------------------------------- + + @Test + void isFileInvalid_path_noExtension_returnsTrue() throws IOException { + // A file with no extension is not in AllowedExtensions → invalid + Path noExt = tempDir.resolve("noextension"); + Files.writeString(noExt, "data", StandardCharsets.UTF_8); + + assertThat(fileFilter.isFileInvalid(noExt)).isTrue(); + } + + @Test + void isFileInvalid_multipartFile_noExtension_returnsTrue() { + MultipartFile file = new MockMultipartFile( + "file", + "noextension", + "application/octet-stream", + "data".getBytes(StandardCharsets.UTF_8) + ); + + assertThat(fileFilter.isFileInvalid(file)).isTrue(); + } + + // ------------------------------------------------------------------------- + // validate(Path) with disallowed content – expect exception + // ------------------------------------------------------------------------- + + @Test + void validate_path_withDisallowedContent_shouldThrow() throws IOException { + Path csvFile = 
tempDir.resolve("inject.csv"); + Files.writeString(csvFile, "col\n", StandardCharsets.UTF_8); + + assertThatThrownBy(() -> fileFilter.validate(csvFile)) + .isInstanceOf(org.taniwha.exception.InvalidFileException.class); + } + + @Test + void validate_multipartFile_withDisallowedContent_shouldThrow() { + MultipartFile file = new MockMultipartFile( + "file", + "attack.csv", + "text/csv", + "col\n".getBytes(StandardCharsets.UTF_8) + ); + + assertThatThrownBy(() -> fileFilter.validate(file)) + .isInstanceOf(org.taniwha.exception.InvalidFileException.class); + } } diff --git a/src/test/java/org/taniwha/service/AnalyticsAuditServiceTest.java b/src/test/java/org/taniwha/service/AnalyticsAuditServiceTest.java new file mode 100644 index 0000000..914a7fb --- /dev/null +++ b/src/test/java/org/taniwha/service/AnalyticsAuditServiceTest.java @@ -0,0 +1,202 @@ +package org.taniwha.service; + +import ch.qos.logback.classic.Logger; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.LoggerFactory; +import org.springframework.security.authentication.UsernamePasswordAuthenticationToken; +import org.springframework.security.core.context.SecurityContextHolder; + +import java.util.List; +import java.util.Map; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Unit tests for {@link AnalyticsAuditService}. + * + *
 * <p>
Log output is captured via a Logback {@link ListAppender} attached to the + * {@code AUDIT} logger so each test can assert on the structured fields that + * are written per operation. + */ +class AnalyticsAuditServiceTest { + + private AnalyticsAuditService service; + private ListAppender listAppender; + private Logger auditLogger; + + @BeforeEach + void setUp() { + service = new AnalyticsAuditService(); + + // Attach a capturing appender to the AUDIT logger + auditLogger = (Logger) LoggerFactory.getLogger("AUDIT"); + listAppender = new ListAppender<>(); + listAppender.start(); + auditLogger.addAppender(listAppender); + + // Start each test with a clean security context + SecurityContextHolder.clearContext(); + } + + @AfterEach + void tearDown() { + auditLogger.detachAppender(listAppender); + SecurityContextHolder.clearContext(); + } + + // ------------------------------------------------------------------------- + // logRequest(op, files) – multi-file, no filters + // ------------------------------------------------------------------------- + + @Test + void logRequest_multiFile_noFilters_containsOpAndFiles() { + service.logRequest("PROCESS", List.of("a.csv", "b.csv")); + + String msg = singleMessage(); + assertThat(msg).contains("op=PROCESS"); + assertThat(msg).contains("a.csv"); + assertThat(msg).contains("b.csv"); + assertThat(msg).contains("ts="); + assertThat(msg).contains("principal=anonymous"); + } + + @Test + void logRequest_multiFile_noFilters_anonymousPrincipal() { + service.logRequest("PROCESS", List.of("x.csv")); + + assertThat(singleMessage()).contains("principal=anonymous"); + } + + // ------------------------------------------------------------------------- + // logRequest(op, files, hasFilters) – multi-file with filter flag + // ------------------------------------------------------------------------- + + @Test + void logRequest_multiFile_withFiltersTrue_containsFiltersField() { + service.logRequest("FILTER", List.of("data.csv"), true); + + String 
msg = singleMessage(); + assertThat(msg).contains("op=FILTER"); + assertThat(msg).contains("filters=true"); + assertThat(msg).contains("data.csv"); + } + + @Test + void logRequest_multiFile_withFiltersFalse_containsFiltersFalse() { + service.logRequest("FILTER", List.of("data.csv"), false); + + assertThat(singleMessage()).contains("filters=false"); + } + + // ------------------------------------------------------------------------- + // logRequest(op, fileName) – single file, no filters + // ------------------------------------------------------------------------- + + @Test + void logRequest_singleFile_noFilters_containsOpAndFile() { + service.logRequest("REPROCESS", "patient.csv"); + + String msg = singleMessage(); + assertThat(msg).contains("op=REPROCESS"); + assertThat(msg).contains("patient.csv"); + assertThat(msg).contains("principal=anonymous"); + } + + // ------------------------------------------------------------------------- + // logRequest(op, fileName, filters) – single file with filter map + // ------------------------------------------------------------------------- + + @Test + void logRequest_singleFile_withFilters_logsFilterKeysNotValues() { + Map filters = Map.of("diagnosis", "cancer", "age", 50); + service.logRequest("FILTER", "records.csv", filters); + + String msg = singleMessage(); + assertThat(msg).contains("filters=true"); + // Keys logged + assertThat(msg).contains("diagnosis").contains("age"); + // Values must NOT appear in the filterKeys field (scope check to avoid + // false positives from numeric substrings in the timestamp) + int start = msg.indexOf("filterKeys=[") + "filterKeys=[".length(); + int end = msg.indexOf("]", start); + String filterKeysPart = msg.substring(start, end); + assertThat(filterKeysPart).doesNotContain("cancer").doesNotContain("50"); + } + + @Test + void logRequest_singleFile_nullFilters_loggedAsNoFilters() { + service.logRequest("FILTER", "records.csv", null); + + String msg = singleMessage(); + 
assertThat(msg).contains("filters=false"); + } + + @Test + void logRequest_singleFile_emptyFilters_loggedAsNoFilters() { + service.logRequest("FILTER", "records.csv", Map.of()); + + String msg = singleMessage(); + assertThat(msg).contains("filters=false"); + } + + // ------------------------------------------------------------------------- + // logResponse + // ------------------------------------------------------------------------- + + @Test + void logResponse_containsAllFields() { + service.logResponse("PROCESS", "dataset.csv", 1234L, 2); + + String msg = singleMessage(); + assertThat(msg).contains("op=PROCESS"); + assertThat(msg).contains("file=dataset.csv"); + assertThat(msg).contains("records=1234"); + assertThat(msg).contains("suppressed=2"); + assertThat(msg).contains("principal=anonymous"); + } + + @Test + void logResponse_zeroSuppressed_recordedCorrectly() { + service.logResponse("FILTER", "f.csv", 50L, 0); + + assertThat(singleMessage()).contains("suppressed=0"); + } + + // ------------------------------------------------------------------------- + // Principal resolution + // ------------------------------------------------------------------------- + + @Test + void logRequest_authenticatedPrincipal_recordedInAuditEntry() { + var auth = new UsernamePasswordAuthenticationToken("dr.smith", null, List.of()); + SecurityContextHolder.getContext().setAuthentication(auth); + + service.logRequest("PROCESS", List.of("records.csv")); + + assertThat(singleMessage()).contains("principal=dr.smith"); + } + + @Test + void logResponse_authenticatedPrincipal_recordedInAuditEntry() { + var auth = new UsernamePasswordAuthenticationToken("researcher1", null, List.of()); + SecurityContextHolder.getContext().setAuthentication(auth); + + service.logResponse("FILTER", "data.csv", 100L, 0); + + assertThat(singleMessage()).contains("principal=researcher1"); + } + + // ------------------------------------------------------------------------- + // Helper + // 
------------------------------------------------------------------------- + + private String singleMessage() { + List events = listAppender.list; + assertThat(events).as("expected exactly one audit log event").hasSize(1); + return events.get(0).getFormattedMessage(); + } +} diff --git a/src/test/java/org/taniwha/service/AnalyticsServiceTest.java b/src/test/java/org/taniwha/service/AnalyticsServiceTest.java index 4c46d28..fd7cdd7 100644 --- a/src/test/java/org/taniwha/service/AnalyticsServiceTest.java +++ b/src/test/java/org/taniwha/service/AnalyticsServiceTest.java @@ -1,16 +1,25 @@ package org.taniwha.service; +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; import org.mockito.InjectMocks; import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.MockitoAnnotations; +import org.springframework.test.util.ReflectionTestUtils; import org.taniwha.dto.AnalyticsResponseDTO; import org.taniwha.dto.FileFilters; +import org.taniwha.service.jobs.AnalyticsProcessingJobs; +import java.io.FileOutputStream; import java.io.IOException; +import java.lang.reflect.Method; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.time.LocalDate; @@ -19,6 +28,8 @@ import java.util.List; import java.util.Map; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.*; @@ -32,6 +43,15 @@ class AnalyticsServiceTest { @Mock private FileService fileService; + @Mock + private AnalyticsProcessingJobs analyticsProcessingJobs; + + @Mock + private DisclosureControlService disclosureControlService; + + @Mock + private 
AnalyticsAuditService analyticsAuditService; + @InjectMocks private AnalyticsService analyticsService; @@ -337,4 +357,359 @@ void filterDataByName_success() throws Exception { assertThat(dto.getContinuousFeatures()) .anySatisfy(fs -> assertThat(fs.getFeatureName()).isEqualTo("num")); } + + @TempDir + Path tempDir; + + // ------------------------------------------------------------------------- + // isAnyHugeForDiscovery – real temp CSV files + // ------------------------------------------------------------------------- + + @Test + void isAnyHugeForDiscovery_smallFile_returnsFalse() throws IOException { + Path csv = tempDir.resolve("small.csv"); + StringBuilder sb = new StringBuilder("col\n"); + for (int i = 0; i < 10; i++) sb.append("val").append(i).append("\n"); + Files.writeString(csv, sb); + + when(fileService.getDatasetFilePath("small.csv")).thenReturn(csv.toString()); + + assertThat(analyticsService.isAnyHugeForDiscovery(List.of("small.csv"))).isFalse(); + } + + @Test + void isAnyHugeForDiscovery_bigFile_returnsTrue() throws IOException { + // Create a file > HUGE_BYTES_THRESHOLD (1 MB) + Path csv = tempDir.resolve("big.csv"); + byte[] chunk = ("longvalue,anothervalue\n").getBytes(); + try (var out = Files.newOutputStream(csv)) { + int written = 0; + while (written < 1_100_000) { + out.write(chunk); + written += chunk.length; + } + } + + when(fileService.getDatasetFilePath("big.csv")).thenReturn(csv.toString()); + + assertThat(analyticsService.isAnyHugeForDiscovery(List.of("big.csv"))).isTrue(); + } + + @Test + void isAnyHugeForDiscovery_manyRows_returnsTrue() throws IOException { + // Create a file with > HUGE_ROWS_THRESHOLD (5000) rows but small total size + Path csv = tempDir.resolve("manyrows.csv"); + StringBuilder sb = new StringBuilder("col\n"); + for (int i = 0; i < 6000; i++) sb.append("v\n"); + Files.writeString(csv, sb); + + when(fileService.getDatasetFilePath("manyrows.csv")).thenReturn(csv.toString()); + + 
assertThat(analyticsService.isAnyHugeForDiscovery(List.of("manyrows.csv"))).isTrue(); + } + + @Test + void isAnyHugeForDiscovery_fileNotFound_returnsFalse() { + when(fileService.getDatasetFilePath("ghost.csv")).thenReturn("/nonexistent/ghost.csv"); + + // Should catch IOException and return false gracefully + assertThat(analyticsService.isAnyHugeForDiscovery(List.of("ghost.csv"))).isFalse(); + } + + @Test + void isAnyHugeForDiscovery_emptyList_returnsFalse() { + assertThat(analyticsService.isAnyHugeForDiscovery(List.of())).isFalse(); + } + + @Test + void isAnyHugeForDiscovery_nonCsvNonXlsxFile_returnsFalse() throws IOException { + Path ttl = tempDir.resolve("meta.ttl"); + Files.writeString(ttl, "content"); + when(fileService.getDatasetFilePath("meta.ttl")).thenReturn(ttl.toString()); + + assertThat(analyticsService.isAnyHugeForDiscovery(List.of("meta.ttl"))).isFalse(); + } + + // ------------------------------------------------------------------------- + // filterMultipleFilesByName – exception paths in future.get() + // ------------------------------------------------------------------------- + + @Test + void filterMultipleFilesByName_executionException_returnsErrorMessage() throws Exception { + AnalyticsService spySvc = Mockito.spy(analyticsService); + FileFilters ff = new FileFilters(); + ff.setFileName("f3"); + ff.setFilters(null); + + java.util.concurrent.CompletableFuture failedFuture = + new java.util.concurrent.CompletableFuture<>(); + failedFuture.completeExceptionally(new RuntimeException("processing error")); + + doReturn(failedFuture).when(spySvc).processSingleFileOnDisk("f3"); + + var results = spySvc.filterMultipleFilesByName(List.of(ff)); + assertThat(results).hasSize(1); + assertThat(results.get(0).getMessage()).contains("Error:"); + } + + // ------------------------------------------------------------------------- + // processRecord coverage via processDatasetsOnDisk – various data types + // 
-------------------------------------------------------------------------
+
+ @Test
+ void processDatasetsOnDisk_dateColumn_detectsDateFeature() throws IOException {
+ String filename = "dates.csv";
+ when(fileService.getDatasetFilePath(filename)).thenReturn("/tmp/" + filename);
+
+ doAnswer(inv -> {
+ java.util.function.Consumer> consumer = inv.getArgument(1);
+ consumer.accept(Map.of("eventDate", "2023-01-15"));
+ consumer.accept(Map.of("eventDate", "2023-06-20"));
+ return null;
+ }).when(dataProcessingService).streamRows(eq(Paths.get("/tmp/" + filename)), any());
+
+ var results = analyticsService.processDatasetsOnDisk(List.of(filename));
+
+ assertThat(results).hasSize(1);
+ AnalyticsResponseDTO dto = results.get(0);
+ assertThat(dto.getDateFeatures()).isNotEmpty();
+ }
+
+ @Test
+ void processDatasetsOnDisk_nullAndEmptyValues_incrementsMissingCount() throws IOException {
+ String filename = "nulls.csv";
+ when(fileService.getDatasetFilePath(filename)).thenReturn("/tmp/" + filename);
+
+ doAnswer(inv -> {
+ java.util.function.Consumer> consumer = inv.getArgument(1);
+ // explicit null, whitespace-only, and the literal string "NULL" — all three count as missing
+ Map row = new java.util.HashMap<>();
+ row.put("col", null);
+ consumer.accept(row);
+ Map row2 = new java.util.HashMap<>();
+ row2.put("col", " ");
+ consumer.accept(row2);
+ Map row3 = new java.util.HashMap<>();
+ row3.put("col", "NULL");
+ consumer.accept(row3);
+ return null;
+ }).when(dataProcessingService).streamRows(eq(Paths.get("/tmp/" + filename)), any());
+
+ var results = analyticsService.processDatasetsOnDisk(List.of(filename));
+
+ assertThat(results).hasSize(1);
+ AnalyticsResponseDTO dto = results.get(0);
+ // All values were missing so the feature may have no data → no data message or empty feature
+ assertThat(dto.getMessage()).isNotNull();
+ }
+
+ @Test
+ void processDatasetsOnDisk_mixedColumnBecomesDate_overridesContinuous() throws IOException {
+ // Every row carries a parseable ISO date → date detection takes priority for the column
+ String filename 
= "mixed.csv"; + when(fileService.getDatasetFilePath(filename)).thenReturn("/tmp/" + filename); + + doAnswer(inv -> { + java.util.function.Consumer> consumer = inv.getArgument(1); + consumer.accept(Map.of("col", "2023-01-01")); + consumer.accept(Map.of("col", "2023-02-01")); + consumer.accept(Map.of("col", "2023-03-01")); + return null; + }).when(dataProcessingService).streamRows(eq(Paths.get("/tmp/" + filename)), any()); + + var results = analyticsService.processDatasetsOnDisk(List.of(filename)); + assertThat(results).hasSize(1); + assertThat(results.get(0).getDateFeatures()).isNotEmpty(); + } + + // ------------------------------------------------------------------------- + // filterMultipleFilesByName – ExecutionException path + // ------------------------------------------------------------------------- + + @Test + void filterMultipleFilesByName_executionException_returnsErrorDto() throws Exception { + String filename = "err.csv"; + when(fileService.getDatasetFilePath(filename)) + .thenThrow(new RuntimeException("Disk not found")); + + FileFilters ff = new FileFilters(); + ff.setFileName(filename); + ff.setFilters(null); + List results = analyticsService.filterMultipleFilesByName(List.of(ff)); + + assertThat(results).hasSize(1); + assertThat(results.get(0).getMessage()).contains("Error"); + } + + // ------------------------------------------------------------------------- + // processRecord – NULL string and whitespace-only values → missing count + // ------------------------------------------------------------------------- + + @Test + void processDatasetsOnDisk_nullStringValue_countedAsMissing() throws IOException { + String filename = "nullstr.csv"; + when(fileService.getDatasetFilePath(filename)).thenReturn("/tmp/" + filename); + + doAnswer(inv -> { + java.util.function.Consumer> consumer = inv.getArgument(1); + Map row = new java.util.HashMap<>(); + row.put("col", "NULL"); + consumer.accept(row); + consumer.accept(Map.of("col", "valid")); + return null; + 
}).when(dataProcessingService).streamRows(eq(Paths.get("/tmp/" + filename)), any()); + + var results = analyticsService.processDatasetsOnDisk(List.of(filename)); + assertThat(results).hasSize(1); + } + + // ------------------------------------------------------------------------- + // processRecord – forced mapping (override continuous + non-parseable value) + // ------------------------------------------------------------------------- + + @Test + void recalculateFeatureAsType_forcedMapping_categorical_nonParseableValue() throws Exception { + String filename = "cat.csv"; + when(fileService.getDatasetFilePath(filename)).thenReturn("/tmp/" + filename); + + when(dataProcessingService.extractDataFromPath(Paths.get("/tmp/" + filename))) + .thenReturn(List.of( + Map.of("label", "cat_A"), + Map.of("label", "cat_B"), + Map.of("label", "cat_A") + )); + + // Override feature as "continuous" → non-numeric "cat_A" triggers forced-mapping path + var result = analyticsService.recalculateFeatureAsTypeFromDisk(filename, "label", "continuous").get(); + + assertThat(result).isNotNull(); + } + + // ------------------------------------------------------------------------- + // processDatasetsOnDisk – categorical data with combinations + // ------------------------------------------------------------------------- + + @Test + void processDatasetsOnDisk_twoCategoricalColumns_buildsCombinationCounts() throws IOException { + String filename = "cats2.csv"; + when(fileService.getDatasetFilePath(filename)).thenReturn("/tmp/" + filename); + + doAnswer(inv -> { + java.util.function.Consumer> consumer = inv.getArgument(1); + consumer.accept(Map.of("color", "red", "shape", "circle")); + consumer.accept(Map.of("color", "blue", "shape", "square")); + consumer.accept(Map.of("color", "red", "shape", "circle")); + consumer.accept(Map.of("color", "red", "shape", "circle")); + consumer.accept(Map.of("color", "red", "shape", "circle")); + consumer.accept(Map.of("color", "red", "shape", "circle")); + 
return null; + }).when(dataProcessingService).streamRows(eq(Paths.get("/tmp/" + filename)), any()); + + var results = analyticsService.processDatasetsOnDisk(List.of(filename)); + assertThat(results).hasSize(1); + assertThat(results.get(0).getCategoricalFeatures()).isNotNull(); + } + + // ------------------------------------------------------------------------- + // processDatasetsOnDisk – feature name with " (" suffix → getOriginalFeatureName strips it + // ------------------------------------------------------------------------- + + @Test + void recalculateFeatureAsType_featureNameWithParenthesis_stripped() throws Exception { + String filename = "paren.csv"; + when(fileService.getDatasetFilePath(filename)).thenReturn("/tmp/" + filename); + + when(dataProcessingService.extractDataFromPath(Paths.get("/tmp/" + filename))) + .thenReturn(List.of( + Map.of("score", "10"), + Map.of("score", "20"), + Map.of("score", "30") + )); + + // featureName "score (% coverage)" → getOriginalFeatureName returns "score" + var result = analyticsService.recalculateFeatureAsTypeFromDisk(filename, "score (% coverage)", "continuous").get(); + assertThat(result).isNotNull(); + } + + @Test + void privatePercent_handlesBoundsAndZeroTotal() throws Exception { + Method m = AnalyticsService.class.getDeclaredMethod("percent", long.class, long.class); + m.setAccessible(true); + + assertThat((int) m.invoke(analyticsService, 5L, 0L)).isEqualTo(0); + assertThat((int) m.invoke(analyticsService, 0L, 10L)).isEqualTo(0); + assertThat((int) m.invoke(analyticsService, 15L, 10L)).isEqualTo(100); + } + + @Test + void privateEstimateRowsFast_csvCountsLinesMinusHeader() throws Exception { + Path csv = Files.createTempFile("analytics_rows", ".csv"); + Files.writeString(csv, "h1;h2\n1;2\n3;4\n"); + + Method m = AnalyticsService.class.getDeclaredMethod("estimateRowsFast", Path.class); + m.setAccessible(true); + long rows = (long) m.invoke(analyticsService, csv); + + assertThat(rows).isEqualTo(2L); + } + + @Test + 
void privateEstimateRowsFast_xlsxReadsSheetDimensions() throws Exception { + Path xlsx = Files.createTempFile("analytics_rows", ".xlsx"); + try (XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sh = wb.createSheet("S1"); + Row h = sh.createRow(0); + h.createCell(0).setCellValue("age"); + sh.createRow(1).createCell(0).setCellValue(10); + sh.createRow(2).createCell(0).setCellValue(20); + try (FileOutputStream fos = new FileOutputStream(xlsx.toFile())) { + wb.write(fos); + } + } + + Method m = AnalyticsService.class.getDeclaredMethod("estimateRowsFast", Path.class); + m.setAccessible(true); + long rows = (long) m.invoke(analyticsService, xlsx); + + assertThat(rows).isGreaterThanOrEqualTo(2L); + } + + @Test + void shutdown_executorTerminatesGracefully() throws Exception { + ExecutorService mockExec = mock(ExecutorService.class); + when(mockExec.awaitTermination(60, TimeUnit.SECONDS)).thenReturn(true); + + ReflectionTestUtils.setField(analyticsService, "discoveryJobExecutor", mockExec); + analyticsService.shutdown(); + + verify(mockExec).shutdown(); + verify(mockExec, never()).shutdownNow(); + } + + @Test + void shutdown_executorRequiresForcedShutdown() throws Exception { + ExecutorService mockExec = mock(ExecutorService.class); + when(mockExec.awaitTermination(60, TimeUnit.SECONDS)).thenReturn(false); + when(mockExec.awaitTermination(10, TimeUnit.SECONDS)).thenReturn(false); + + ReflectionTestUtils.setField(analyticsService, "discoveryJobExecutor", mockExec); + analyticsService.shutdown(); + + verify(mockExec).shutdown(); + verify(mockExec).shutdownNow(); + } + + @Test + void shutdown_interruptedDuringAwait_forcesShutdownAndPreservesInterrupt() throws Exception { + ExecutorService mockExec = mock(ExecutorService.class); + when(mockExec.awaitTermination(60, TimeUnit.SECONDS)).thenThrow(new InterruptedException("x")); + + ReflectionTestUtils.setField(analyticsService, "discoveryJobExecutor", mockExec); + analyticsService.shutdown(); + + verify(mockExec).shutdown(); + 
verify(mockExec).shutdownNow(); + assertThat(Thread.currentThread().isInterrupted()).isTrue(); + Thread.interrupted(); // clear interrupt flag for subsequent tests + } } diff --git a/src/test/java/org/taniwha/service/DataCleaningServiceTest.java b/src/test/java/org/taniwha/service/DataCleaningServiceTest.java index da5b80b..e37a0e4 100644 --- a/src/test/java/org/taniwha/service/DataCleaningServiceTest.java +++ b/src/test/java/org/taniwha/service/DataCleaningServiceTest.java @@ -14,8 +14,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.doNothing; -import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.*; class DataCleaningServiceTest { @@ -1080,4 +1079,438 @@ void standardizeCase_sentenceCase() { var result = svc.standardizeCase(new ArrayList<>(List.of(row)), "sentence"); assertThat(result.get(0).get("text")).isEqualTo("Hello world test"); } + + // ========== extractNumericColumns ========== + + @Test + void standardizeNumericInPlace_withTripleColonSyntax_parsesColumnName() { + Map row = new HashMap<>(Map.of("price", "3.14159")); + // extractNumericColumns strips the "file:::" prefix via standardizeNumericInPlace + svc.standardizeNumericInPlace(row, Set.of("price"), "double"); + assertThat(row.get("price")).isEqualTo("3.14159"); + } + + @Test + void standardizeNumericInPlace_int_trunc_mode() { + Map row = new HashMap<>(Map.of("val", "9.9")); + svc.standardizeNumericInPlace(row, Set.of("val"), "int_trunc"); + assertThat(row.get("val")).isEqualTo("9"); + } + + @Test + void standardizeNumericInPlace_int_round_mode() { + Map row = new HashMap<>(Map.of("val", "9.6")); + svc.standardizeNumericInPlace(row, Set.of("val"), "int_round"); + assertThat(row.get("val")).isEqualTo("10"); + } + + @Test + void standardizeNumericInPlace_unknownMode_noChange() { + Map row = new HashMap<>(Map.of("val", "5.0")); + svc.standardizeNumericInPlace(row, Set.of("val"), "hex"); 
+ assertThat(row.get("val")).isEqualTo("5.0"); // unchanged + } + + @Test + void standardizeNumericInPlace_nonNumericValue_skipped() { + Map row = new HashMap<>(Map.of("val", "not-a-number")); + svc.standardizeNumericInPlace(row, Set.of("val"), "double"); + assertThat(row.get("val")).isEqualTo("not-a-number"); // unchanged + } + + @Test + void standardizeNumericInPlace_missingColumn_skipped() { + Map row = new HashMap<>(Map.of("other", "5.0")); + svc.standardizeNumericInPlace(row, Set.of("price"), "double"); + assertThat(row.get("other")).isEqualTo("5.0"); // unchanged + } + + // ========== normalizeURLs ========== + + @Test + void normalizeURLs_lowercasesAndStripsTrailingSlash() { + Map row = new HashMap<>(Map.of("url", "https://example.com/path/")); + var result = svc.normalizeURLs(new ArrayList<>(List.of(row))); + assertThat(result.get(0).get("url")).isEqualTo("https://example.com/path"); + } + + @Test + void normalizeURLs_noTrailingSlash_noChange() { + Map row = new HashMap<>(Map.of("url", "https://example.com")); + var result = svc.normalizeURLs(new ArrayList<>(List.of(row))); + assertThat(result.get(0).get("url")).isEqualTo("https://example.com"); + } + + @Test + void normalizeURLs_nonHttpValue_unchanged() { + Map row = new HashMap<>(Map.of("col", "ftp://example.com")); + var result = svc.normalizeURLs(new ArrayList<>(List.of(row))); + assertThat(result.get(0).get("col")).isEqualTo("ftp://example.com"); + } + + // ========== validateEmails ========== + + @Test + void validateEmails_invalidEmail_clearedToEmpty() { + Map row = new HashMap<>(Map.of("email", "not-valid@")); + var result = svc.validateEmails(new ArrayList<>(List.of(row))); + assertThat(result.get(0).get("email")).isEmpty(); + } + + @Test + void validateEmails_validEmail_unchanged() { + Map row = new HashMap<>(Map.of("email", "user@example.com")); + var result = svc.validateEmails(new ArrayList<>(List.of(row))); + assertThat(result.get(0).get("email")).isEqualTo("user@example.com"); + } + + @Test + 
void validateEmails_noAtSign_unchanged() { + Map row = new HashMap<>(Map.of("email", "notanemail")); + var result = svc.validateEmails(new ArrayList<>(List.of(row))); + assertThat(result.get(0).get("email")).isEqualTo("notanemail"); // no @ → untouched + } + + // ========== standardizePhoneNumbers (e164 and national paths) ========== + + @Test + void standardizePhoneNumbers_e164Format() { + Map row = new HashMap<>(Map.of("phone", "555-867-5309")); + var result = svc.standardizePhoneNumbers(new ArrayList<>(List.of(row)), "e164", "+1"); + assertThat(result.get(0).get("phone")).startsWith("+1"); + } + + @Test + void standardizePhoneNumbers_tooShortDigits_unchanged() { + Map row = new HashMap<>(Map.of("phone", "123")); + var result = svc.standardizePhoneNumbers(new ArrayList<>(List.of(row)), "national", "+1"); + assertThat(result.get(0).get("phone")).isEqualTo("123"); // too short, not replaced + } + + @Test + void standardizePhoneNumbers_nullFormat_usesNational() { + Map row = new HashMap<>(Map.of("phone", "5558675309")); + var result = svc.standardizePhoneNumbers(new ArrayList<>(List.of(row)), null, null); + assertThat(result.get(0).get("phone")).contains("(555)"); + } + + // ========== convertDataTypes additional paths ========== + + @Test + void convertDataTypes_boolean() { + Map row = new HashMap<>(Map.of("active", "true")); + var result = svc.convertDataTypes(new ArrayList<>(List.of(row)), Map.of("active", "boolean")); + assertThat(result.get(0).get("active")).isEqualTo("true"); + } + + @Test + void convertDataTypes_string() { + Map row = new HashMap<>(Map.of("name", "Alice")); + var result = svc.convertDataTypes(new ArrayList<>(List.of(row)), Map.of("name", "string")); + assertThat(result.get(0).get("name")).isEqualTo("Alice"); + } + + @Test + void convertDataTypes_unknownType_unchanged() { + Map row = new HashMap<>(Map.of("col", "value")); + var result = svc.convertDataTypes(new ArrayList<>(List.of(row)), Map.of("col", "hex")); + 
assertThat(result.get(0).get("col")).isEqualTo("value"); + } + + @Test + void convertDataTypes_parseFailure_valueUnchanged() { + Map row = new HashMap<>(Map.of("num", "not-a-number")); + var result = svc.convertDataTypes(new ArrayList<>(List.of(row)), Map.of("num", "integer")); + assertThat(result.get(0).get("num")).isEqualTo("not-a-number"); // parse failed → unchanged + } + + // ========== fillMissingValues – mean, median, mode, forward, backward, interpolate ========== + + @Test + void fillMissingValues_mean_fillsNulls() { + Map r1 = new HashMap<>(Map.of("v", "10")); + Map r2 = new HashMap<>(); + r2.put("v", null); + Map r3 = new HashMap<>(Map.of("v", "20")); + var result = svc.fillMissingValues(new ArrayList<>(List.of(r1, r2, r3)), "mean", null, null); + assertThat(result.get(1).get("v")).isEqualTo("15.0"); + } + + @Test + void fillMissingValues_median_oddSize() { + Map r1 = new HashMap<>(Map.of("v", "1")); + Map r2 = new HashMap<>(); + r2.put("v", null); + Map r3 = new HashMap<>(Map.of("v", "3")); + Map r4 = new HashMap<>(Map.of("v", "5")); + var result = svc.fillMissingValues(new ArrayList<>(List.of(r1, r2, r3, r4)), "median", null, null); + assertThat(result.get(1).get("v")).isNotNull(); + } + + @Test + void fillMissingValues_median_evenSize() { + Map r1 = new HashMap<>(Map.of("v", "10")); + Map r2 = new HashMap<>(Map.of("v", "20")); + Map r3 = new HashMap<>(); + r3.put("v", null); + var result = svc.fillMissingValues(new ArrayList<>(List.of(r1, r2, r3)), "median", null, null); + assertThat(result.get(2).get("v")).isEqualTo("15.0"); // (10+20)/2 + } + + @Test + void fillMissingValues_mode_fillsWithMostFrequent() { + Map r1 = new HashMap<>(Map.of("v", "A")); + Map r2 = new HashMap<>(Map.of("v", "A")); + Map r3 = new HashMap<>(Map.of("v", "B")); + Map r4 = new HashMap<>(); + r4.put("v", null); + var result = svc.fillMissingValues(new ArrayList<>(List.of(r1, r2, r3, r4)), "mode", null, null); + assertThat(result.get(3).get("v")).isEqualTo("A"); + } + + @Test + 
void fillMissingValues_forward_propagatesLastValue() { + Map r1 = new HashMap<>(Map.of("v", "hello")); + Map r2 = new HashMap<>(); + r2.put("v", null); + Map r3 = new HashMap<>(); + r3.put("v", ""); + var result = svc.fillMissingValues(new ArrayList<>(List.of(r1, r2, r3)), "forward", null, null); + assertThat(result.get(1).get("v")).isEqualTo("hello"); + assertThat(result.get(2).get("v")).isEqualTo("hello"); + } + + @Test + void fillMissingValues_backward_propagatesNextValue() { + Map r1 = new HashMap<>(); + r1.put("v", null); + Map r2 = new HashMap<>(Map.of("v", "world")); + var result = svc.fillMissingValues(new ArrayList<>(List.of(r1, r2)), "backward", null, null); + assertThat(result.get(0).get("v")).isEqualTo("world"); + } + + @Test + void fillMissingValues_interpolate_fillsNumericGap() { + Map r1 = new HashMap<>(Map.of("v", "0")); + Map r2 = new HashMap<>(); + r2.put("v", null); + Map r3 = new HashMap<>(Map.of("v", "10")); + var result = svc.fillMissingValues(new ArrayList<>(List.of(r1, r2, r3)), "interpolate", null, null); + assertThat(result.get(1).get("v")).isEqualTo("5.0"); + } + + @Test + void fillMissingValues_constant_fillsWithGivenValue() { + Map r1 = new HashMap<>(); + r1.put("v", null); + var result = svc.fillMissingValues(new ArrayList<>(List.of(r1)), "constant", "N/A", null); + assertThat(result.get(0).get("v")).isEqualTo("N/A"); + } + + @Test + void fillMissingValues_unknownStrategy_noChange() { + Map r1 = new HashMap<>(); + r1.put("v", null); + var result = svc.fillMissingValues(new ArrayList<>(List.of(r1)), "unknown_strategy", null, null); + assertThat(result.get(0).get("v")).isNull(); // unchanged + } + + @Test + void fillMissingValues_withTargetColumns_onlyFillsSpecified() { + Map r1 = new HashMap<>(); + r1.put("a", null); + r1.put("b", null); + var result = svc.fillMissingValues(new ArrayList<>(List.of(r1)), "constant", "X", List.of("a")); + assertThat(result.get(0).get("a")).isEqualTo("X"); + assertThat(result.get(0).get("b")).isNull(); // 
not in target columns + } + + // ========== cosineSimilarity / getTrigrams via mergeSimilarValues ========== + + @Test + void mergeSimilarValues_cosine_mergesSimilarValues() { + // Identical strings plus a low threshold guarantee at least one merge + Map r1 = new HashMap<>(Map.of("brand", "apple")); + Map r2 = new HashMap<>(Map.of("brand", "apple")); // identical + Map r3 = new HashMap<>(Map.of("brand", "orange")); + var result = svc.mergeSimilarValues( + new ArrayList<>(List.of(r1, r2, r3)), Set.of("brand"), "cosine", 0.5, true, true, "most_frequent"); + // "apple" and "apple" should merge to one canonical; "orange" stays + assertThat(result.stream().map(m -> m.get("brand")).collect(java.util.stream.Collectors.toSet())) + .hasSizeLessThanOrEqualTo(2); + } + + @Test + void mergeSimilarValues_jaroWinkler_mergesSimilar() { + Map r1 = new HashMap<>(Map.of("col", "MARTHA")); + Map r2 = new HashMap<>(Map.of("col", "MARHTA")); // transposed + var result = svc.mergeSimilarValues( + new ArrayList<>(List.of(r1, r2)), Set.of("col"), "jaro_winkler", 0.8, false, false, "shortest"); + assertThat(result.get(0).get("col")).isEqualTo(result.get(1).get("col")); + } + + @Test + void mergeSimilarValues_chooseCanonical_longestStrategy() { + Map r1 = new HashMap<>(Map.of("col", "ab")); + Map r2 = new HashMap<>(Map.of("col", "abc")); // longer + var result = svc.mergeSimilarValues( + new ArrayList<>(List.of(r1, r2)), Set.of("col"), "levenshtein", 0.5, false, false, "longest"); + assertThat(result.stream().map(m -> m.get("col")).toList()).allSatisfy(v -> assertThat(v).isEqualTo("abc")); + } + + @Test + void mergeSimilarValues_chooseCanonical_alphabeticalStrategy() { + Map r1 = new HashMap<>(Map.of("col", "banana")); + Map r2 = new HashMap<>(Map.of("col", "banane")); // close + var result = svc.mergeSimilarValues( + new ArrayList<>(List.of(r1, r2)), Set.of("col"), "levenshtein", 0.7, false, false, "alphabetical"); + // canonical = "banana" (alphabetically first) + 
assertThat(result.stream().map(m -> m.get("col")).toList()).allSatisfy(v -> assertThat(v).isEqualTo("banana")); + } + + // ========== binData additional edge cases ========== + + @Test + void binData_nullEdges_noChange() { + Map row = new HashMap<>(Map.of("val", "5")); + var result = svc.binData(new ArrayList<>(List.of(row)), "val", null, null); + // Null edges → no binning + assertThat(result.get(0).containsKey("val_binned")).isFalse(); + } + + @Test + void binData_outOfRange_usesOverflowLabel() { + Map row = new HashMap<>(Map.of("age", "200")); + List edges = Arrays.asList(0.0, 18.0, 65.0); + List labels = Arrays.asList("young", "adult"); + var result = svc.binData(new ArrayList<>(List.of(row)), "age", edges, labels); + // Out of range → may get last label or empty + assertThat(result.get(0)).containsKey("age_binned"); + } + + // ========== fixEncoding / looksLikeMojibake ========== + + @Test + void fixEncoding_withMojibake_corrects() { + // UTF-8 bytes of "café" decoded as latin1 yield the mojibake "cafÃ©" + Map row = new HashMap<>(Map.of("text", "cafÃ©")); + var result = svc.fixEncoding(new ArrayList<>(List.of(row))); + assertThat(result).hasSize(1); + // After fix-encoding the mojibake characters should be handled + assertThat(result.get(0).get("text")).isNotNull(); + } + + @Test + void fixEncoding_cleanString_unchanged() { + Map row = new HashMap<>(Map.of("text", "Hello World")); + var result = svc.fixEncoding(new ArrayList<>(List.of(row))); + assertThat(result.get(0).get("text")).isEqualTo("Hello World"); + } + + // ========== splitColumn edge cases ========== + + @Test + void splitColumn_withNullColumnValue_skips() { + Map row = new HashMap<>(); + row.put("full", null); + var result = svc.splitColumn(new ArrayList<>(List.of(row)), "full", ",", null); + assertThat(result.get(0).containsKey("full_part1")).isFalse(); + } + + @Test + void splitColumn_withAutoGeneratedNames() { + Map row = new HashMap<>(Map.of("full", "a,b,c")); + var result = svc.splitColumn(new 
ArrayList<>(List.of(row)), "full", ",", null); + assertThat(result.get(0).get("full_part1")).isEqualTo("a"); + assertThat(result.get(0).get("full_part2")).isEqualTo("b"); + assertThat(result.get(0).get("full_part3")).isEqualTo("c"); + } + + // ========== normalizeUnicode ========== + + @Test + void normalizeUnicode_nfd_decomposesCharacters() { + Map row = new HashMap<>(Map.of("text", "café")); + var result = svc.normalizeUnicode(new ArrayList<>(List.of(row)), "NFD"); + assertThat(result.get(0).get("text")).isNotNull(); + } + + @Test + void normalizeUnicode_nfkc_form() { + Map row = new HashMap<>(Map.of("text", "fi")); // fi ligature + var result = svc.normalizeUnicode(new ArrayList<>(List.of(row)), "NFKC"); + assertThat(result.get(0).get("text")).isEqualTo("fi"); + } + + @Test + void normalizeUnicode_defaultForm() { + Map row = new HashMap<>(Map.of("text", "test")); + var result = svc.normalizeUnicode(new ArrayList<>(List.of(row)), "NFC"); // default branch + assertThat(result.get(0).get("text")).isEqualTo("test"); + } + + // ========== padValues ========== + + @Test + void padValues_leftPad() { + Map row = new HashMap<>(Map.of("code", "7")); + var result = svc.padValues(new ArrayList<>(List.of(row)), "left", 5, "0"); + assertThat(result.get(0).get("code")).isEqualTo("00007"); + } + + @Test + void padValues_rightPad() { + Map row = new HashMap<>(Map.of("code", "7")); + var result = svc.padValues(new ArrayList<>(List.of(row)), "right", 4, " "); + assertThat(result.get(0).get("code")).isEqualTo("7 "); + } + + // ========== cleanInPlace via public method with TempDir ========== + + @org.junit.jupiter.api.io.TempDir + java.nio.file.Path cleanInPlaceTempDir; + + @Test + void cleanInPlace_withRealFileService_applyAllOps() throws Exception { + java.nio.file.Path tempDir = cleanInPlaceTempDir; + // Build a real FileService pointing to tempDir + org.taniwha.security.FileFilter realFilter = mock(org.taniwha.security.FileFilter.class); + 
doNothing().when(realFilter).validate(any(java.nio.file.Path.class)); + FileService realFileService = new FileService(realFilter, tempDir.toString()); + DataProcessingService realDPS = new DataProcessingService(realFilter); + CleaningProcessingJobs jobs = new CleaningProcessingJobs(); + DataCleaningService realSvc = new DataCleaningService(realFileService, realDPS, jobs); + + // Create CSV file in datasets folder + java.nio.file.Path ds = tempDir.resolve("datasets"); + java.nio.file.Files.createDirectories(ds); + java.nio.file.Files.writeString(ds.resolve("test.csv"), + "name;email;price\n Alice ;alice@example.com;10\n Alice ;bad@;20\n"); + + org.taniwha.dto.DataCleaningOptionsDTO opts = new org.taniwha.dto.DataCleaningOptionsDTO(); + opts.setTrimWhitespace(true); + opts.setValidateEmails(true); + opts.setStandardizeNumeric(true); + opts.setNumericMode("double"); + opts.setNumericColumns(List.of("test:::price")); + + realSvc.cleanInPlace(org.taniwha.model.FileCategory.DATASETS, "test.csv", opts); + + List lines = java.nio.file.Files.readAllLines(ds.resolve("test.csv")); + assertThat(lines.get(1)).contains("Alice"); // trimmed + } + + @Test + void cleanInPlace_noOptsEnabled_returnsEarly() throws Exception { + // With opts == null, cleanInPlace should return early without touching the file + FileService mockFS = mock(FileService.class); + when(mockFS.resolveExistingFilePath(any(), any())).thenReturn(java.nio.file.Path.of("/tmp/dummy.csv")); + DataProcessingService mockDPS = mock(DataProcessingService.class); + CleaningProcessingJobs jobs = new CleaningProcessingJobs(); + DataCleaningService svcWithMocks = new DataCleaningService(mockFS, mockDPS, jobs); + + // opts == null → anyEnabled returns false → early return + svcWithMocks.cleanInPlace(org.taniwha.model.FileCategory.DATASETS, "dummy.csv", null); + verify(mockDPS, never()).extractDataFromPath(any()); + } } diff --git a/src/test/java/org/taniwha/service/DataProcessingServiceTest.java 
b/src/test/java/org/taniwha/service/DataProcessingServiceTest.java index 99d7389..6193db7 100644 --- a/src/test/java/org/taniwha/service/DataProcessingServiceTest.java +++ b/src/test/java/org/taniwha/service/DataProcessingServiceTest.java @@ -392,4 +392,283 @@ void extractFilteredDataFromPath_csv_nonNumericCompare_returnsEmpty() throws Exc var out = service.extractFilteredDataFromPath(csv, filters); assertThat(out).isEmpty(); } + + // ------------------------------------------------------------------------- + // evaluateCategorical – non-"equal" type returns false + // ------------------------------------------------------------------------- + + @Test + void extractFilteredDataFromPath_csv_categoricalGreater_noMatch() throws Exception { + Path csv = Files.createTempFile("dps_cat_gt", ".csv"); + Files.writeString(csv, "status\nACTIVE\nINACTIVE\n"); + doNothing().when(fileFilter).validate(csv); + + // "greater" is not a valid categorical check → evaluateCategorical returns false + Map filters = Map.of( + "operator", "AND", + "conditions", Map.of( + "status", Map.of( + "conditions", List.of( + Map.of("type", "greater", "filterType", "categorical", "value", "ACTIVE") + ), + "operators", List.of() + ) + ) + ); + + var out = service.extractFilteredDataFromPath(csv, filters); + assertThat(out).isEmpty(); + } + + @Test + void extractFilteredDataFromPath_csv_categoricalEqual_matches() throws Exception { + Path csv = Files.createTempFile("dps_cat_eq", ".csv"); + Files.writeString(csv, "status\nACTIVE\nINACTIVE\n"); + doNothing().when(fileFilter).validate(csv); + + Map filters = Map.of( + "operator", "AND", + "conditions", Map.of( + "status", Map.of( + "conditions", List.of( + Map.of("type", "equal", "filterType", "categorical", "value", "ACTIVE") + ), + "operators", List.of() + ) + ) + ); + + var out = service.extractFilteredDataFromPath(csv, filters); + assertThat(out).hasSize(1); + assertThat(out.get(0).get("status")).isEqualTo("ACTIVE"); + } + + // 
------------------------------------------------------------------------- + // compareValues – date comparison branch + // ------------------------------------------------------------------------- + + @Test + void extractFilteredDataFromPath_csv_dateGreater_filtersCorrectly() throws Exception { + Path csv = Files.createTempFile("dps_date_gt", ".csv"); + Files.writeString(csv, "dt\n2023-01-01\n2023-06-15\n2024-01-01\n"); + doNothing().when(fileFilter).validate(csv); + + Map filters = Map.of( + "operator", "AND", + "conditions", Map.of( + "dt", Map.of( + "conditions", List.of( + Map.of("type", "greater", "filterType", "date", "value", "2023-05-01") + ), + "operators", List.of() + ) + ) + ); + + var out = service.extractFilteredDataFromPath(csv, filters); + assertThat(out).hasSize(2) + .extracting(m -> m.get("dt")) + .containsExactly("2023-06-15", "2024-01-01"); + } + + // ------------------------------------------------------------------------- + // readXlsxFile(MultipartFile) – via streamRows(multipart, xlsx) + // (readXlsxFile for Path is already covered; this covers the streaming variant) + // ------------------------------------------------------------------------- + + @Test + void streamRows_multipartXlsx_readsAllRows_streamingReader() throws Exception { + // Build an xlsx in-memory using Apache POI + org.apache.poi.xssf.usermodel.XSSFWorkbook wb = new org.apache.poi.xssf.usermodel.XSSFWorkbook(); + org.apache.poi.ss.usermodel.Sheet sheet = wb.createSheet("Sheet1"); + org.apache.poi.ss.usermodel.Row header = sheet.createRow(0); + header.createCell(0).setCellValue("name"); + header.createCell(1).setCellValue("age"); + org.apache.poi.ss.usermodel.Row row1 = sheet.createRow(1); + row1.createCell(0).setCellValue("Alice"); + row1.createCell(1).setCellValue(30); + org.apache.poi.ss.usermodel.Row row2 = sheet.createRow(2); + row2.createCell(0).setCellValue("Bob"); + row2.createCell(1).setCellValue(25); + + java.io.ByteArrayOutputStream bos = new 
java.io.ByteArrayOutputStream(); + wb.write(bos); + wb.close(); + byte[] xlsxBytes = bos.toByteArray(); + + MockMultipartFile file = new MockMultipartFile( + "file", "people.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + xlsxBytes); + doNothing().when(fileFilter).validate(file); + + List> rows = new ArrayList<>(); + service.streamRows(file, rows::add); + + assertThat(rows).hasSize(2); + assertThat(rows.get(0).get("name")).isEqualTo("Alice"); + assertThat(rows.get(0).get("age")).isEqualTo("30"); + assertThat(rows.get(1).get("name")).isEqualTo("Bob"); + } + + // ------------------------------------------------------------------------- + // applyFilters – "between" date range + // ------------------------------------------------------------------------- + + @Test + void extractFilteredDataFromPath_csv_dateBetween_filtersCorrectly() throws Exception { + Path csv = Files.createTempFile("dps_date_between", ".csv"); + Files.writeString(csv, "dt\n2022-12-01\n2023-03-15\n2023-07-01\n"); + doNothing().when(fileFilter).validate(csv); + + Map filters = Map.of( + "operator", "AND", + "conditions", Map.of( + "dt", Map.of( + "conditions", List.of( + Map.of("type", "between", "filterType", "date", + "value", List.of("2023-01-01", "2023-06-30")) + ), + "operators", List.of() + ) + ) + ); + + var out = service.extractFilteredDataFromPath(csv, filters); + assertThat(out).hasSize(1) + .first() + .extracting(m -> m.get("dt")) + .isEqualTo("2023-03-15"); + } + + // ------------------------------------------------------------------------- + // applyComplexCondition – multiple conditions combined with AND / OR + // ------------------------------------------------------------------------- + + @Test + void extractFilteredDataFromPath_csv_multiConditionAnd_bothMustMatch() throws Exception { + Path csv = Files.createTempFile("dps_multi_and", ".csv"); + Files.writeString(csv, "score\n5\n10\n15\n"); + doNothing().when(fileFilter).validate(csv); + + Map filters = 
Map.of( + "operator", "AND", + "conditions", Map.of( + "score", Map.of( + "conditions", List.of( + Map.of("type", "greater", "filterType", "continuous", "value", "4"), + Map.of("type", "less", "filterType", "continuous", "value", "12") + ), + "operators", List.of("AND") + ) + ) + ); + + var out = service.extractFilteredDataFromPath(csv, filters); + // 5 and 10 match > 4 AND < 12; 15 does not + assertThat(out).hasSize(2); + assertThat(out).extracting(m -> m.get("score")).containsExactlyInAnyOrder("5", "10"); + } + + @Test + void extractFilteredDataFromPath_csv_multiConditionOr_eitherMustMatch() throws Exception { + Path csv = Files.createTempFile("dps_multi_or", ".csv"); + Files.writeString(csv, "score\n1\n10\n20\n"); + doNothing().when(fileFilter).validate(csv); + + Map filters = Map.of( + "operator", "AND", + "conditions", Map.of( + "score", Map.of( + "conditions", List.of( + Map.of("type", "less", "filterType", "continuous", "value", "5"), + Map.of("type", "greater", "filterType", "continuous", "value", "15") + ), + "operators", List.of("OR") + ) + ) + ); + + var out = service.extractFilteredDataFromPath(csv, filters); + // 1 matches (< 5), 20 matches (> 15), 10 doesn't + assertThat(out).hasSize(2); + assertThat(out).extracting(m -> m.get("score")).containsExactlyInAnyOrder("1", "20"); + } + + // ------------------------------------------------------------------------- + // evaluateCondition – categorical NOT equal (returns false) + // ------------------------------------------------------------------------- + + @Test + void extractFilteredDataFromPath_csv_categoricalEqual_matches2() throws Exception { + Path csv = Files.createTempFile("dps_cat", ".csv"); + Files.writeString(csv, "color\nred\nblue\ngreen\n"); + doNothing().when(fileFilter).validate(csv); + + Map filters = Map.of( + "operator", "AND", + "conditions", Map.of( + "color", Map.of( + "conditions", List.of( + Map.of("type", "equal", "filterType", "categorical", "value", "blue") + ), + "operators", 
List.of() + ) + ) + ); + + var out = service.extractFilteredDataFromPath(csv, filters); + assertThat(out).hasSize(1); + assertThat(out.get(0).get("color")).isEqualTo("blue"); + } + + @Test + void extractFilteredDataFromPath_csv_categoricalNonEqual_noMatch() throws Exception { + Path csv = Files.createTempFile("dps_cat_ne", ".csv"); + Files.writeString(csv, "color\nred\n"); + doNothing().when(fileFilter).validate(csv); + + Map filters = Map.of( + "operator", "AND", + "conditions", Map.of( + "color", Map.of( + "conditions", List.of( + Map.of("type", "not_equal", "filterType", "categorical", "value", "red") + ), + "operators", List.of() + ) + ) + ); + + var out = service.extractFilteredDataFromPath(csv, filters); + assertThat(out).isEmpty(); // non-equal type → evaluateCategorical returns false → filtered out + } + + // ------------------------------------------------------------------------- + // evaluateCondition – empty/null feature value → returns false + // ------------------------------------------------------------------------- + + @Test + void extractFilteredDataFromPath_csv_emptyFeatureValue_filtered() throws Exception { + Path csv = Files.createTempFile("dps_empty", ".csv"); + Files.writeString(csv, "score\n\n5\n"); + doNothing().when(fileFilter).validate(csv); + + Map filters = Map.of( + "operator", "AND", + "conditions", Map.of( + "score", Map.of( + "conditions", List.of( + Map.of("type", "greater", "filterType", "continuous", "value", "0") + ), + "operators", List.of() + ) + ) + ); + + var out = service.extractFilteredDataFromPath(csv, filters); + // Empty row filtered out → only score=5 + assertThat(out).hasSize(1); + assertThat(out.get(0).get("score")).isEqualTo("5"); + } } diff --git a/src/test/java/org/taniwha/service/DisclosureControlServiceTest.java b/src/test/java/org/taniwha/service/DisclosureControlServiceTest.java new file mode 100644 index 0000000..a2691c2 --- /dev/null +++ b/src/test/java/org/taniwha/service/DisclosureControlServiceTest.java 
@@ -0,0 +1,493 @@ +package org.taniwha.service; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.taniwha.dto.AnalyticsResponseDTO; +import org.taniwha.statistics.*; + +import java.util.*; + +import static org.assertj.core.api.Assertions.assertThat; + +class DisclosureControlServiceTest { + + private static final int MIN_SUBSET = 3; + private static final int MIN_CELL = 2; + + private DisclosureControlService service; + + @BeforeEach + void setUp() { + service = new DisclosureControlService(MIN_SUBSET, MIN_CELL); + } + + // ------------------------------------------------------------------------- + // Global suppression + // ------------------------------------------------------------------------- + + @Test + void apply_totalRecordsBelowThreshold_suppressesAllFeatures() { + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setContinuousFeatures(List.of(continuousFeature("age", 4))); + response.setCategoricalFeatures(List.of(categoricalFeature("gender", Map.of("M", 2, "F", 2)))); + response.setDateFeatures(List.of(dateFeature("dob", 4))); + response.setCovariances(Map.of("age", Map.of("age", 1.0))); + + int suppressed = service.apply(response, 2); + + assertThat(suppressed).isEqualTo(3); + assertThat(response.getContinuousFeatures()).isEmpty(); + assertThat(response.getCategoricalFeatures()).isEmpty(); + assertThat(response.getDateFeatures()).isEmpty(); + assertThat(response.getOmittedFeatures()).hasSize(3); + assertThat(response.getCovariances()).isEmpty(); + assertThat(response.getPearsonCorrelations()).isEmpty(); + assertThat(response.getSpearmanCorrelations()).isEmpty(); + assertThat(response.getChiSquareTest()).isEmpty(); + } + + @Test + void apply_totalRecordsAtThreshold_doesNotGlobalSuppress() { + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setContinuousFeatures(List.of(continuousFeature("age", 10))); + + service.apply(response, MIN_SUBSET); + + 
assertThat(response.getContinuousFeatures()).hasSize(1); + } + + // ------------------------------------------------------------------------- + // Continuous feature suppression + // ------------------------------------------------------------------------- + + @Test + void apply_continuousFeatureBelowSubsetThreshold_movedToOmitted() { + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setContinuousFeatures(List.of( + continuousFeature("weight", MIN_SUBSET - 1), + continuousFeature("height", MIN_SUBSET + 1) + )); + + service.apply(response, 20); + + assertThat(response.getContinuousFeatures()) + .extracting(fs -> fs.getFeatureName()) + .containsExactly("height"); + assertThat(response.getOmittedFeatures()) + .extracting(fs -> fs.getFeatureName()) + .containsExactly("weight"); + } + + @Test + void apply_continuousOutliers_smallGroup_valueSuppressed() { + // Only 1 outlier – below minCellCount (2) → values must be cleared + List outliers = new ArrayList<>(List.of(250.0)); + ContinuousFeatureStatistics feature = new ContinuousFeatureStatistics( + "bp", 20, 0, 0, 20, 60, 250, 90, 10, 80, 90, 100, + List.of(1.0, 2.0), List.of("[60-80]", "[80-100]"), outliers); + + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setContinuousFeatures(List.of(feature)); + + service.apply(response, 20); + + ContinuousFeatureStatistics result = + (ContinuousFeatureStatistics) response.getContinuousFeatures().get(0); + assertThat(result.getOutliers()).as("single outlier must be suppressed").isEmpty(); + // Aggregate stats are unchanged + assertThat(result.getMin()).isEqualTo(60.0); + assertThat(result.getMax()).isEqualTo(250.0); + } + + @Test + void apply_continuousOutliers_largeGroup_valuesPreserved() { + // 3 outliers – >= minCellCount (2) → values must be kept + List outliers = new ArrayList<>(List.of(200.0, 210.0, 220.0)); + ContinuousFeatureStatistics feature = new ContinuousFeatureStatistics( + "bp", 20, 0, 0, 20, 60, 220, 90, 10, 80, 90, 
100, + List.of(1.0, 2.0), List.of("[60-80]", "[80-100]"), outliers); + + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setContinuousFeatures(List.of(feature)); + + service.apply(response, 20); + + ContinuousFeatureStatistics result = + (ContinuousFeatureStatistics) response.getContinuousFeatures().get(0); + assertThat(result.getOutliers()) + .as("outlier group >= minCellCount must be preserved for analytics") + .containsExactlyInAnyOrder(200.0, 210.0, 220.0); + } + + @Test + void apply_continuousNoOutliers_unchanged() { + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setContinuousFeatures(List.of(continuousFeature("age", 20))); + + service.apply(response, 20); + + ContinuousFeatureStatistics result = + (ContinuousFeatureStatistics) response.getContinuousFeatures().get(0); + assertThat(result.getOutliers()).isEmpty(); + } + + // ------------------------------------------------------------------------- + // Categorical cell suppression + // ------------------------------------------------------------------------- + + @Test + void apply_categoricalSmallCellsSuppressed() { + Map counts = new HashMap<>(); + counts.put("A", 10); + counts.put("B", 1); // below threshold (1 < 2) + counts.put("C", 1); // below threshold (1 < 2) + + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setCategoricalFeatures(List.of(categoricalFeature("diagnosis", counts))); + + service.apply(response, 20); + + assertThat(response.getCategoricalFeatures()).hasSize(1); + CategoricalFeatureStatistics result = + (CategoricalFeatureStatistics) response.getCategoricalFeatures().get(0); + assertThat(result.getCategoryCounts()).containsOnlyKeys("A"); + assertThat(result.getMode()).isEqualTo("A"); + } + + @Test + void apply_categoricalAllCellsSuppressed_movedToOmitted() { + Map counts = new HashMap<>(); + counts.put("A", 1); + counts.put("B", 1); // both 1 < minCellCount(2) → entire feature suppressed + + AnalyticsResponseDTO 
response = new AnalyticsResponseDTO(); + response.setCategoricalFeatures(List.of(categoricalFeature("rare", counts))); + + service.apply(response, 20); + + assertThat(response.getCategoricalFeatures()).isEmpty(); + assertThat(response.getOmittedFeatures()).hasSize(1); + assertThat(response.getOmittedFeatures().get(0).getFeatureName()).isEqualTo("rare"); + } + + @Test + void apply_categoricalNoCellsSuppressed_unchanged() { + Map counts = new HashMap<>(); + counts.put("X", 10); + counts.put("Y", 8); + + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setCategoricalFeatures(List.of(categoricalFeature("status", counts))); + + service.apply(response, 20); + + assertThat(response.getCategoricalFeatures()).hasSize(1); + CategoricalFeatureStatistics result = + (CategoricalFeatureStatistics) response.getCategoricalFeatures().get(0); + assertThat(result.getCategoryCounts()).containsKeys("X", "Y"); + } + + // ------------------------------------------------------------------------- + // Date feature suppression + // ------------------------------------------------------------------------- + + @Test + void apply_dateFeatureBelowSubsetThreshold_movedToOmitted() { + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setDateFeatures(List.of(dateFeature("admission", MIN_SUBSET - 1))); + + service.apply(response, 20); + + assertThat(response.getDateFeatures()).isEmpty(); + assertThat(response.getOmittedFeatures()).hasSize(1); + assertThat(response.getOmittedFeatures().get(0).getFeatureName()).isEqualTo("admission"); + } + + @Test + void apply_dateHistogramSmallBucketsSuppressed() { + Map histogram = new HashMap<>(); + histogram.put("2020-01-01", 10L); + histogram.put("2020-01-02", 1L); // below threshold + + DateFeatureStatistics feature = new DateFeatureStatistics( + "visit", 20, 0, 0, + "2020-01-01", "2020-01-02", + histogram, List.of(), + "2020-01-01", 1.0, "2020-01-01", "2020-01-01", "2020-01-02"); + + AnalyticsResponseDTO response = 
new AnalyticsResponseDTO(); + response.setDateFeatures(List.of(feature)); + + service.apply(response, 20); + + assertThat(response.getDateFeatures()).hasSize(1); + assertThat(response.getDateFeatures().get(0).getDateHistogram()) + .containsOnlyKeys("2020-01-01"); + } + + @Test + void apply_dateOutliers_smallGroup_valuesSuppressed() { + // Only 1 date outlier – below minCellCount → must be cleared + DateFeatureStatistics feature = new DateFeatureStatistics( + "dob", 20, 0, 0, + "1900-01-01", "2024-01-01", + new HashMap<>(), new ArrayList<>(List.of("1900-01-01")), + "2000-01-01", 5.0, "2000-01-01", "1990-01-01", "2010-01-01"); + + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setDateFeatures(List.of(feature)); + + service.apply(response, 20); + + assertThat(response.getDateFeatures()).hasSize(1); + assertThat(response.getDateFeatures().get(0).getOutliers()) + .as("single date outlier must be suppressed").isEmpty(); + } + + @Test + void apply_dateOutliers_largeGroup_valuesPreserved() { + // 2 date outliers – >= minCellCount → must be kept + DateFeatureStatistics feature = new DateFeatureStatistics( + "visit", 20, 0, 0, + "1900-01-01", "2024-01-01", + new HashMap<>(), new ArrayList<>(List.of("1900-01-01", "1901-03-15")), + "2000-01-01", 5.0, "2000-01-01", "1990-01-01", "2010-01-01"); + + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setDateFeatures(List.of(feature)); + + service.apply(response, 20); + + assertThat(response.getDateFeatures()).hasSize(1); + assertThat(response.getDateFeatures().get(0).getOutliers()) + .as("outlier group >= minCellCount must be preserved") + .containsExactlyInAnyOrder("1900-01-01", "1901-03-15"); + } + + // ------------------------------------------------------------------------- + // Helpers + // ------------------------------------------------------------------------- + + private ContinuousFeatureStatistics continuousFeature(String name, long count) { + return new 
ContinuousFeatureStatistics( + name, count, 0, 0, (int) count, + 0, 100, 50, 10, 25, 50, 75, + List.of(1.0), List.of("[0-100]"), List.of()); + } + + private CategoricalFeatureStatistics categoricalFeature(String name, Map counts) { + List> sorted = counts.entrySet().stream() + .sorted((a, b) -> b.getValue().compareTo(a.getValue())) + .toList(); + String mode = sorted.get(0).getKey(); + int modeFreq = sorted.get(0).getValue(); + String secondMode = sorted.size() > 1 ? sorted.get(1).getKey() : null; + Integer secondModeFreq = sorted.size() > 1 ? sorted.get(1).getValue() : null; + long total = counts.values().stream().mapToInt(Integer::intValue).sum(); + return new CategoricalFeatureStatistics( + name, total, 0, 0, counts.size(), + mode, modeFreq, (double) modeFreq / total * 100, + secondMode, secondModeFreq, + secondModeFreq != null ? (double) secondModeFreq / total * 100 : null, + counts); + } + + private DateFeatureStatistics dateFeature(String name, long count) { + return new DateFeatureStatistics( + name, count, 0, 0, + "2020-01-01", "2024-01-01", + new HashMap<>(), List.of(), + "2022-01-01", 365.0, "2022-01-01", "2021-01-01", "2023-01-01"); + } + + // ------------------------------------------------------------------------- + // Correlation matrix cleanup for suppressed continuous features + // ------------------------------------------------------------------------- + + @Test + void apply_continuousFeatureSuppressed_removedFromAllCorrelationMatrices() { + // "bp" has count 2, below MIN_SUBSET (3) → suppressed + // "age" has count 10 → kept + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setContinuousFeatures(List.of( + continuousFeature("bp", MIN_SUBSET - 1), + continuousFeature("age", 10))); + + // Simulate correlation matrices pre-populated by AggregateCalculator + Map bpInner = new HashMap<>(Map.of("age", 0.9)); + Map ageInner = new HashMap<>(Map.of("bp", 0.9)); + response.setCovariances(new HashMap<>(Map.of("bp", bpInner, "age", 
ageInner))); + response.setPearsonCorrelations(new HashMap<>(Map.of("bp", new HashMap<>(Map.of("age", 0.9)), + "age", new HashMap<>(Map.of("bp", 0.9))))); + response.setSpearmanCorrelations(new HashMap<>(Map.of("bp", new HashMap<>(Map.of("age", 0.8)), + "age", new HashMap<>(Map.of("bp", 0.8))))); + + service.apply(response, 20); + + assertThat(response.getContinuousFeatures()) + .extracting(fs -> fs.getFeatureName()).containsExactly("age"); + assertThat(response.getOmittedFeatures()) + .extracting(fs -> fs.getFeatureName()).containsExactly("bp"); + + // "bp" must be gone as outer key + assertThat(response.getCovariances()).doesNotContainKey("bp"); + assertThat(response.getPearsonCorrelations()).doesNotContainKey("bp"); + assertThat(response.getSpearmanCorrelations()).doesNotContainKey("bp"); + + // "bp" must also be removed from "age"'s inner map + assertThat(response.getCovariances().get("age")).doesNotContainKey("bp"); + assertThat(response.getPearsonCorrelations().get("age")).doesNotContainKey("bp"); + assertThat(response.getSpearmanCorrelations().get("age")).doesNotContainKey("bp"); + } + + // ------------------------------------------------------------------------- + // Chi-squared cleanup for fully-suppressed categorical features + // ------------------------------------------------------------------------- + + @Test + void apply_categoricalFeatureFullySuppressed_removedFromChiSquaredResults() { + // "rare" has all cells below MIN_CELL → fully suppressed + // "status" is fine + Map rareCounts = new HashMap<>(Map.of("A", 1)); // 1 < 2 → suppressed + Map statusCounts = new HashMap<>(Map.of("X", 10, "Y", 8)); + + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setCategoricalFeatures(List.of( + categoricalFeature("rare", rareCounts), + categoricalFeature("status", statusCounts))); + + // Simulate chi-squared results pre-populated by AggregateCalculator + List chi = new ArrayList<>(); + chi.add(new 
org.taniwha.statistics.ChiSquaredTestResult("rare", "status", 0.04)); + chi.add(new org.taniwha.statistics.ChiSquaredTestResult("status", "other", 0.12)); + response.setChiSquareTest(chi); + + service.apply(response, 20); + + assertThat(response.getCategoricalFeatures()) + .extracting(fs -> fs.getFeatureName()).containsExactly("status"); + assertThat(response.getOmittedFeatures()) + .extracting(fs -> fs.getFeatureName()).containsExactly("rare"); + + // The chi-squared entry involving "rare" must be removed + assertThat(response.getChiSquareTest()) + .noneMatch(r -> "rare".equals(r.getCategory1()) || "rare".equals(r.getCategory2())); + // Unrelated chi-squared entries must be preserved + assertThat(response.getChiSquareTest()) + .anySatisfy(r -> { + assertThat(r.getCategory1()).isEqualTo("status"); + assertThat(r.getCategory2()).isEqualTo("other"); + }); + } + + // ------------------------------------------------------------------------- + // Getters + // ------------------------------------------------------------------------- + + @Test + void getMinSubsetSize_returnsConfiguredValue() { + assertThat(service.getMinSubsetSize()).isEqualTo(MIN_SUBSET); + } + + @Test + void getMinCellCount_returnsConfiguredValue() { + assertThat(service.getMinCellCount()).isEqualTo(MIN_CELL); + } + + // ------------------------------------------------------------------------- + // Null-list guards in moveFeaturesToOmitted / moveDateFeaturesToOmitted + // ------------------------------------------------------------------------- + + @Test + void apply_totalRecordsBelowThreshold_nullFeatureLists_doesNotThrow() { + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setContinuousFeatures(null); + response.setCategoricalFeatures(null); + response.setDateFeatures(null); + + int suppressed = service.apply(response, 2); + + assertThat(suppressed).isZero(); + assertThat(response.getOmittedFeatures()).isEmpty(); + } + + // 
------------------------------------------------------------------------- + // rebuildCategoricalFeature with only one surviving category (no secondMode) + // ------------------------------------------------------------------------- + + @Test + void apply_categoricalSingleSurvivorAfterCellSuppression_rebuildWithNoSecondMode() { + // "A"=10 survives, "B"=1 is suppressed → only one category left + Map counts = new HashMap<>(); + counts.put("A", 10); + counts.put("B", 1); + + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setCategoricalFeatures(List.of(categoricalFeature("dx", counts))); + + service.apply(response, 20); + + assertThat(response.getCategoricalFeatures()).hasSize(1); + CategoricalFeatureStatistics result = + (CategoricalFeatureStatistics) response.getCategoricalFeatures().get(0); + assertThat(result.getMode()).isEqualTo("A"); + assertThat(result.getSecondMode()).isNull(); + assertThat(result.getSecondModeFrequency()).isNull(); + assertThat(result.getSecondModePercentage()).isNull(); + } + + @Test + void apply_categoricalTwoSurvivorsAfterCellSuppression_rebuildWithSecondMode() { + // "A"=10, "B"=8 survive; "C"=1 is suppressed → comparator lambda is invoked + Map counts = new HashMap<>(); + counts.put("A", 10); + counts.put("B", 8); + counts.put("C", 1); + + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setCategoricalFeatures(List.of(categoricalFeature("feature", counts))); + + service.apply(response, 20); + + assertThat(response.getCategoricalFeatures()).hasSize(1); + CategoricalFeatureStatistics result = + (CategoricalFeatureStatistics) response.getCategoricalFeatures().get(0); + assertThat(result.getMode()).isEqualTo("A"); + assertThat(result.getSecondMode()).isEqualTo("B"); + assertThat(result.getSecondModeFrequency()).isEqualTo(8); + } + + // ------------------------------------------------------------------------- + // Null-matrix guard in removeFromCorrelationMatrix + // 
------------------------------------------------------------------------- + + @Test + void apply_continuousFeatureSuppressed_nullCorrelationMatrix_doesNotThrow() { + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setContinuousFeatures(List.of(continuousFeature("tiny", MIN_SUBSET - 1))); + response.setCovariances(null); + response.setPearsonCorrelations(null); + response.setSpearmanCorrelations(null); + + int suppressed = service.apply(response, 20); + + assertThat(suppressed).isEqualTo(1); + assertThat(response.getOmittedFeatures()).hasSize(1); + } + + // ------------------------------------------------------------------------- + // safeList null-guard + // ------------------------------------------------------------------------- + + @Test + void apply_nullOmittedFeaturesOnInput_treatedAsEmpty() { + AnalyticsResponseDTO response = new AnalyticsResponseDTO(); + response.setOmittedFeatures(null); + response.setContinuousFeatures(List.of(continuousFeature("f", MIN_SUBSET - 1))); + + // should not throw; omittedFeatures must be populated after suppression + service.apply(response, 20); + + assertThat(response.getOmittedFeatures()).hasSize(1); + } +} diff --git a/src/test/java/org/taniwha/service/FileServiceTest.java b/src/test/java/org/taniwha/service/FileServiceTest.java index e49c69b..458f9fa 100644 --- a/src/test/java/org/taniwha/service/FileServiceTest.java +++ b/src/test/java/org/taniwha/service/FileServiceTest.java @@ -5,8 +5,11 @@ import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; import org.mockito.MockitoAnnotations; +import org.taniwha.dto.FileInfoDto; +import org.taniwha.exception.InvalidFileException; import org.taniwha.model.Dataset; import org.taniwha.model.Distribution; +import org.taniwha.model.FileCategory; import org.taniwha.model.NodeMetadata; import org.taniwha.security.FileFilter; @@ -18,6 +21,7 @@ import java.util.List; import static org.assertj.core.api.Assertions.assertThat; +import static 
org.assertj.core.api.Assertions.assertThatCode; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.doNothing; @@ -265,4 +269,264 @@ void mappedDatasetsFolder_isSameAsDatasetsFolder() { assertThat(foo).endsWith("/datasets"); assertThat(fileService.listMappedDatasetFiles()).isEmpty(); } + + // ------------------------------------------------------------------------- + // resolveXxxFilePath – just verify they delegate to the correct sub-folder + // ------------------------------------------------------------------------- + + @Test + void resolveDatasetFilePath_existingFile_returnsPath() throws IOException { + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds); + Path file = ds.resolve("x.csv"); + Files.createFile(file); + + Path result = fileService.resolveDatasetFilePath("x.csv"); + assertThat(result).isEqualTo(file); + } + + @Test + void resolveElementFilePath_existingFile_returnsPath() throws IOException { + Path el = tempBase.resolve("dataset_elements"); + Files.createDirectories(el); + Path file = el.resolve("e.csv"); + Files.createFile(file); + + Path result = fileService.resolveElementFilePath("e.csv"); + assertThat(result).isEqualTo(file); + } + + @Test + void resolveMappedDatasetFilePath_existingFile_returnsPath() throws IOException { + // mapped datasets reuse the datasets folder + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds); + Path file = ds.resolve("m.csv"); + Files.createFile(file); + + Path result = fileService.resolveMappedDatasetFilePath("m.csv"); + assertThat(result).isEqualTo(file); + } + + @Test + void resolveMetadataFilePath_existingFile_returnsPath() throws IOException { + Path md = tempBase.resolve("dataset_metadata"); + Files.createDirectories(md); + Path file = md.resolve("meta.ttl"); + Files.createFile(file); + + Path result = fileService.resolveMetadataFilePath("meta.ttl"); + 
assertThat(result).isEqualTo(file); + } + + @Test + void resolveExistingFilePath_nonExistentFile_throws() { + assertThatThrownBy(() -> fileService.resolveDatasetFilePath("no.csv")) + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + void resolveExistingFilePath_pathTraversal_throws() throws IOException { + assertThatThrownBy(() -> fileService.resolveDatasetFilePath("../secret.txt")) + .isInstanceOf(IllegalArgumentException.class); + } + + // ------------------------------------------------------------------------- + // listFilesWithInfo + // ------------------------------------------------------------------------- + + @Test + void listFilesWithInfo_nonExistentDir_returnsEmpty() { + assertThat(fileService.listFilesWithInfo(FileCategory.FHIR_MAPPINGS)).isEmpty(); + } + + @Test + void listFilesWithInfo_returnsMetaForFiles() throws IOException { + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds); + Files.writeString(ds.resolve("a.csv"), "data"); + Files.writeString(ds.resolve("b.csv"), "more"); + + List infos = fileService.listFilesWithInfo(FileCategory.DATASETS); + + assertThat(infos) + .hasSize(2) + .extracting(org.taniwha.dto.FileInfoDto::getName) + .containsExactly("a.csv", "b.csv"); // sorted alphabetically + assertThat(infos.get(0).getSizeBytes()).isEqualTo(4L); // "data" + } + + @Test + void listFilesWithInfo_fhirMappingsCategory_returnsFiles() throws IOException { + Path fm = tempBase.resolve("fhir_mappings"); + Files.createDirectories(fm); + Files.writeString(fm.resolve("m.json"), "{}"); + + assertThat(fileService.listFilesWithInfo(FileCategory.FHIR_MAPPINGS)).hasSize(1); + } + + @Test + void listFilesWithInfo_datasetElementsAndMetadataCategories() throws IOException { + Path el = tempBase.resolve("dataset_elements"); + Files.createDirectories(el); + Files.createFile(el.resolve("e.csv")); + + Path md = tempBase.resolve("dataset_metadata"); + Files.createDirectories(md); + Files.createFile(md.resolve("meta.ttl")); + + 
assertThat(fileService.listFilesWithInfo(FileCategory.DATASET_ELEMENTS)).hasSize(1); + assertThat(fileService.listFilesWithInfo(FileCategory.DATASET_METADATA)).hasSize(1); + } + + @Test + void listFilesWithInfo_skipsSubdirectories() throws IOException { + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds.resolve("subdir")); + Files.createFile(ds.resolve("only.csv")); + + List infos = fileService.listFilesWithInfo(FileCategory.DATASETS); + assertThat(infos).hasSize(1).extracting(org.taniwha.dto.FileInfoDto::getName).containsExactly("only.csv"); + } + + // ------------------------------------------------------------------------- + // renameFile + // ------------------------------------------------------------------------- + + @Test + void renameFile_existingFile_renames() throws IOException { + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds); + Files.createFile(ds.resolve("old.csv")); + + fileService.renameFile(FileCategory.DATASETS, "old.csv", "new.csv"); + + assertThat(ds.resolve("new.csv")).exists(); + assertThat(ds.resolve("old.csv")).doesNotExist(); + } + + @Test + void renameFile_sameSourceAndDestination_isNoOp() throws IOException { + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds); + Files.createFile(ds.resolve("same.csv")); + + // sanitizeFileName("same.csv") == "same.csv" -> no-op + fileService.renameFile(FileCategory.DATASETS, "same.csv", "same.csv"); + + assertThat(ds.resolve("same.csv")).exists(); + } + + @Test + void renameFile_destinationAlreadyExists_throws() throws IOException { + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds); + Files.createFile(ds.resolve("a.csv")); + Files.createFile(ds.resolve("b.csv")); + + assertThatThrownBy(() -> fileService.renameFile(FileCategory.DATASETS, "a.csv", "b.csv")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Destination already exists"); + } + + @Test + void renameFile_emptyDestination_throws() throws 
IOException { + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds); + Files.createFile(ds.resolve("src.csv")); + + assertThatThrownBy(() -> fileService.renameFile(FileCategory.DATASETS, "src.csv", "")) + .isInstanceOf(IllegalArgumentException.class); + } + + // ------------------------------------------------------------------------- + // deleteFile + // ------------------------------------------------------------------------- + + @Test + void deleteFile_existingFile_deletes() throws IOException { + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds); + Files.createFile(ds.resolve("del.csv")); + + fileService.deleteFile(FileCategory.DATASETS, "del.csv"); + + assertThat(ds.resolve("del.csv")).doesNotExist(); + } + + @Test + void deleteFile_nonExistentFile_isNoOp() { + // Should not throw even if file does not exist + assertThatCode(() -> fileService.deleteFile(FileCategory.DATASETS, "ghost.csv")) + .doesNotThrowAnyException(); + } + + @Test + void deleteFile_pathTraversal_throws() { + assertThatThrownBy(() -> fileService.deleteFile(FileCategory.DATASETS, "../secret.txt")) + .isInstanceOf(IllegalArgumentException.class); + } + + // ------------------------------------------------------------------------- + // listFilesWithInfo – MAPPED_DATASETS and DATASET_METADATA categories + // ------------------------------------------------------------------------- + + @Test + void listFilesWithInfo_mappedDatasetsCategory_returnsFiles() throws IOException { + // MAPPED_DATASETS maps to the same dir as DATASETS (mappedDatasetsDir = datasetsDir) + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds); + Path f = ds.resolve("result.csv"); + Files.writeString(f, "col\nval\n"); + + List result = fileService.listFilesWithInfo(FileCategory.MAPPED_DATASETS); + assertThat(result).extracting(FileInfoDto::getName).contains("result.csv"); + } + + @Test + void listFilesWithInfo_metadataCategoryNonExistentDir_returnsEmpty() { + // 
DATASET_METADATA directory might not exist → should return empty list + List result = fileService.listFilesWithInfo(FileCategory.DATASET_METADATA); + assertThat(result).isEmpty(); + } + + @Test + void listFilesWithInfo_invalidFileIsSkipped() throws IOException { + // Put an invalid file; FileFilter throws on it → should be skipped gracefully + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds); + Path bad = ds.resolve("bad.exe"); + Files.writeString(bad, "binary"); + + org.mockito.Mockito.doThrow(new org.taniwha.exception.InvalidFileException("bad ext")) + .when(fileFilter).validate(bad); + + List result = fileService.listFilesWithInfo(FileCategory.DATASETS); + assertThat(result.stream().map(FileInfoDto::getName)).doesNotContain("bad.exe"); + } + + // ------------------------------------------------------------------------- + // resolveSafePath – backslash traversal guard + // ------------------------------------------------------------------------- + + @Test + void resolveExistingFilePath_backslashInName_throws() { + // backslash is treated as path separator → triggers "Invalid file name" + assertThatThrownBy(() -> fileService.resolveExistingFilePath(FileCategory.DATASETS, "..\\secret.csv")) + .isInstanceOf(IllegalArgumentException.class); + } + + // ------------------------------------------------------------------------- + // deleteFile – validation failure is ignored (validate is called but file exists) + // ------------------------------------------------------------------------- + + @Test + void deleteFile_nonExistentFileAfterResolve_isNoOp() throws IOException { + // Ensure file doesn't exist; deleteFile should silently return + Path ds = tempBase.resolve("datasets"); + Files.createDirectories(ds); + // no file created – should just return + fileService.deleteFile(FileCategory.DATASETS, "ghost2.csv"); + // no exception expected + } } diff --git a/src/test/java/org/taniwha/service/HarmonizerServiceTest.java 
b/src/test/java/org/taniwha/service/HarmonizerServiceTest.java index c6c22a1..c9ba5cf 100644 --- a/src/test/java/org/taniwha/service/HarmonizerServiceTest.java +++ b/src/test/java/org/taniwha/service/HarmonizerServiceTest.java @@ -21,6 +21,7 @@ class HarmonizerServiceTest { private HarmonizerService harmonizerService; + private HarmonizationProcessingJobs jobs; @TempDir Path baseDir; @@ -35,7 +36,7 @@ void setUp() { CleaningProcessingJobs cleaningJobs = new CleaningProcessingJobs(); DataCleaningService dataCleaningService = new DataCleaningService(fileService, dataProcessingService, cleaningJobs); - HarmonizationProcessingJobs jobs = new HarmonizationProcessingJobs(); + jobs = new HarmonizationProcessingJobs(); harmonizerService = new HarmonizerService( dataProcessingService, @@ -272,4 +273,475 @@ void parseFiles_withCustomOneHotAndRangeMapping_combinesCorrectly() throws Excep assertThat(rows.get(2)).isEqualTo("42;55;2025-07-12T11:00:00Z;1;"); assertThat(rows.get(3)).isEqualTo("70;30;2025-07-13T12:00:00Z;0;"); } -} \ No newline at end of file + + // ------------------------------------------------------------------------- + // matchesDeclaredType – via type-marker string value in config + // ------------------------------------------------------------------------- + + @Test + void parseFiles_typeMarker_integer_passesThroughIntegerValues() throws Exception { + // In custom-only mode (no regular config for the dataset key), seeding does NOT happen. + // Type marker "integer" passes through whole-number values and rejects decimals/non-numerics. 
+ makeCsv("nums.csv", "score\n42\n3.14\nnotanum\n"); + + String configs = """ + [ + { + "custom_mapping": { + "fileName":"custom_mapping", + "mappingType":"default", + "columns":["score"], + "groups":[ + { + "values":[ + { + "name":"INT_SCORE", + "mapping":[ + { "groupColumn":"score", "value":"integer" } + ] + } + ] + } + ] + } + } + ] + """; + + // Key "noconfig" is not in configs → custom-only mode (no seeding) + harmonizerService.parseFiles(configs, Map.of("noconfig", List.of("nums.csv")), null); + + Path out = baseDir.resolve("datasets").resolve("parsed_nums.csv"); + List rows = Files.readAllLines(out); + + // Row with 42 (integer) → raw value passthrough "42" + assertThat(rows.get(1)).isEqualTo("42"); + // Row with 3.14 (decimal, not an integer) → empty + assertThat(rows.get(2)).isEqualTo(""); + // Row with "notanum" (non-numeric) → empty + assertThat(rows.get(3)).isEqualTo(""); + } + + @Test + void parseFiles_typeMarker_double_passesThroughNumericValues() throws Exception { + makeCsv("floats.csv", "val\n1.5\nword\n"); + + String configs = """ + [ + { + "custom_mapping": { + "fileName":"custom_mapping", + "mappingType":"default", + "columns":["val"], + "groups":[ + { + "values":[ + { + "name":"DBL_VAL", + "mapping":[ + { "groupColumn":"val", "value":"double" } + ] + } + ] + } + ] + } + } + ] + """; + + // Use a key that's NOT in configs → custom-only mode (no seeding) + harmonizerService.parseFiles(configs, Map.of("noconfig", List.of("floats.csv")), null); + + Path out = baseDir.resolve("datasets").resolve("parsed_floats.csv"); + List rows = Files.readAllLines(out); + + assertThat(rows.get(1)).isEqualTo("1.5"); // numeric → passthrough + assertThat(rows.get(2)).isEqualTo(""); // non-numeric → empty + } + + @Test + void parseFiles_typeMarker_date_passesThroughDateValues() throws Exception { + makeCsv("dates.csv", "dt\n2024-03-01\nnotadate\n"); + + String configs = """ + [ + { + "custom_mapping": { + "fileName":"custom_mapping", + "mappingType":"default", + 
"columns":["dt"], + "groups":[ + { + "values":[ + { + "name":"DATE_VAL", + "mapping":[ + { "groupColumn":"dt", "value":"date" } + ] + } + ] + } + ] + } + } + ] + """; + + // Use a key that's NOT in configs → custom-only mode + harmonizerService.parseFiles(configs, Map.of("noconfig", List.of("dates.csv")), null); + + Path out = baseDir.resolve("datasets").resolve("parsed_dates.csv"); + List rows = Files.readAllLines(out); + + assertThat(rows.get(1)).isEqualTo("2024-03-01"); // date → passthrough + assertThat(rows.get(2)).isEqualTo(""); // non-date → empty + } + + // ------------------------------------------------------------------------- + // matchRangeOrDate – date range type + // ------------------------------------------------------------------------- + + @Test + void parseFiles_dateRangeMapping_includesInRangeRows() throws Exception { + makeCsv("events.csv", "dt\n2023-03-10\n2022-06-15\n2023-11-01\n"); + + String configs = """ + [ + { + "custom_mapping": { + "fileName":"custom_mapping", + "mappingType":"default", + "columns":["dt"], + "groups":[ + { + "values":[ + { + "name":"IN_RANGE", + "mapping":[ + { + "groupColumn":"dt", + "value": { "type":"date", "minValue":"2023-01-01", "maxValue":"2023-12-31" } + } + ] + } + ] + } + ] + } + } + ] + """; + + harmonizerService.parseFiles(configs, Map.of("events", List.of("events.csv")), null); + + Path out = baseDir.resolve("datasets").resolve("parsed_events.csv"); + List rows = Files.readAllLines(out); + + // header + assertThat(rows.get(0)).isEqualTo("custom_mapping"); + // 2023-03-10 is in range → IN_RANGE + assertThat(rows.get(1)).isEqualTo("IN_RANGE"); + // 2022-06-15 out of range → empty + assertThat(rows.get(2)).isEqualTo(""); + // 2023-11-01 is in range → IN_RANGE + assertThat(rows.get(3)).isEqualTo("IN_RANGE"); + } + + // ------------------------------------------------------------------------- + // shouldStandardizeNumeric – branches + // ------------------------------------------------------------------------- 
+ + @Test + void parseFiles_withStandardizeNumeric_normalizesValues() throws Exception { + // Tests shouldStandardizeNumeric returning true → calls standardizeNumericInPlace + makeCsv("nums2.csv", "price\n10.5\n20.0\n"); + + String configs = """ + [ + { + "cfg_num": { + "fileName":"cfg_num", + "columns":["price"], + "groups":[ + { "column":"price", "values":[] } + ] + } + } + ] + """; + + DataCleaningOptionsDTO cleanOpts = new DataCleaningOptionsDTO(); + cleanOpts.setStandardizeNumeric(true); + cleanOpts.setNumericMode("double"); + cleanOpts.setNumericColumns(List.of("cfg_num:::price")); + + harmonizerService.parseFiles(configs, Map.of("cfg_num", List.of("nums2.csv")), cleanOpts); + + Path out = baseDir.resolve("datasets").resolve("parsed_nums2.csv"); + assertThat(out).exists(); + List rows = Files.readAllLines(out); + // rows[1] and rows[2] contain the price values (standardized to double format) + assertThat(rows).hasSizeGreaterThan(1); + } + + @Test + void parseFiles_withStandardizeNumeric_falseFlag_noNormalization() throws Exception { + makeCsv("nums3.csv", "price\n10.5\n"); + + String configs = """ + [ + { + "cfg_num2": { + "fileName":"cfg_num2", + "columns":["price"], + "groups":[ + { "column":"price", "values":[] } + ] + } + } + ] + """; + + DataCleaningOptionsDTO cleanOpts = new DataCleaningOptionsDTO(); + cleanOpts.setStandardizeNumeric(false); // ← key: shouldStandardizeNumeric returns false + + // Should still process without error + String msg = harmonizerService.parseFiles(configs, Map.of("cfg_num2", List.of("nums3.csv")), cleanOpts); + assertThat(msg).isEqualTo("Files processed successfully."); + } + + @Test + void parseFiles_withStandardizeNumeric_emptyColumns_noNormalization() throws Exception { + makeCsv("nums4.csv", "price\n10.5\n"); + + String configs = """ + [ + { + "cfg_num3": { + "fileName":"cfg_num3", + "columns":["price"], + "groups":[ + { "column":"price", "values":[] } + ] + } + } + ] + """; + + DataCleaningOptionsDTO cleanOpts = new 
DataCleaningOptionsDTO(); + cleanOpts.setStandardizeNumeric(true); + cleanOpts.setNumericMode("double"); + cleanOpts.setNumericColumns(List.of()); // ← empty → shouldStandardizeNumeric returns false + + String msg = harmonizerService.parseFiles(configs, Map.of("cfg_num3", List.of("nums4.csv")), cleanOpts); + assertThat(msg).isEqualTo("Files processed successfully."); + } + + // ========================================================================= + // parseFilesWithProgress – covers the async path and lambda chains + // ========================================================================= + + @Test + void parseFilesWithProgress_emptyConfig_completesSuccessfully() throws Exception { + makeCsv("prog1.csv", "col1;col2\nA;1\nB;2\n"); + + String configs = "[]"; + Map> mappings = Map.of("prog1.csv", List.of("prog1.csv")); + + String result = harmonizerService.parseFilesWithProgress("job-p1", configs, mappings, null); + assertThat(result).contains("successfully"); + } + + @Test + void parseFilesWithProgress_withSimpleConfig_writesOutputFile() throws Exception { + makeCsv("prog2.csv", "color;size\nred;10\nblue;20\n"); + + String configs = """ + [ + { + "prog2.csv": { + "fileName":"prog2.csv", + "columns":["color"], + "groups":[ + { + "column":"color", + "values":[ + {"value":"red", "mapping":[{"name":"COLOR_RED", "type":"binary"}]}, + {"value":"blue", "mapping":[{"name":"COLOR_BLUE", "type":"binary"}]} + ] + } + ] + } + } + ] + """; + + Map> mappings = Map.of("prog2.csv", List.of("prog2.csv")); + + String result = harmonizerService.parseFilesWithProgress("job-p2", configs, mappings, null); + assertThat(result).contains("successfully"); + } + + @Test + void parseFilesWithProgress_missingFile_skipsAndCompletes() throws Exception { + // Do NOT create the file → should skip with a warning and complete + String configs = "[]"; + Map> mappings = Map.of("missing.csv", List.of("missing.csv")); + + String result = harmonizerService.parseFilesWithProgress("job-p3", configs, 
mappings, null); + assertThat(result).contains("successfully"); + } + + @Test + void parseFilesWithProgress_withCleaningOpts_appliesCleaningAndCompletes() throws Exception { + makeCsv("prog4.csv", "name;score\n Alice ;10\n Alice ;20\n"); + + String configs = "[]"; + Map> mappings = Map.of("prog4.csv", List.of("prog4.csv")); + + DataCleaningOptionsDTO cleanOpts = new DataCleaningOptionsDTO(); + cleanOpts.setTrimWhitespace(true); + cleanOpts.setRemoveDuplicates(false); + + String result = harmonizerService.parseFilesWithProgress("job-p4", configs, mappings, cleanOpts); + assertThat(result).contains("successfully"); + } + + @Test + void parseFilesWithProgress_emptyMappings_completesSuccessfully() throws Exception { + String configs = "[]"; + Map> mappings = Map.of(); + + String result = harmonizerService.parseFilesWithProgress("job-p5", configs, mappings, null); + assertThat(result).contains("successfully"); + } + + @Test + void parseFilesWithProgress_withCustomOneHotAndDefaultMappings_writesMappedDataset() throws Exception { + makeCsv("d_prog.csv", """ + age;score;when + 25;88;2025-07-11T10:00:00Z + 42;55;2025-07-12T11:00:00Z + """); + + String configs = """ + [ + { + "cfg_prog": { + "fileName":"cfg_prog", + "columns":["age","score","when"], + "groups":[ + { "column":"age", "values":[] }, + { "column":"score", "values":[] }, + { "column":"when", "values":[] } + ] + } + }, + { + "age_group": { + "fileName":"custom_mapping", + "mappingType":"one-hot", + "columns":["age"], + "groups":[ + { + "values":[ + { + "name":"MID", + "mapping":[ + { + "groupColumn":"age", + "value": { "type":"integer", "minValue":40, "maxValue":60 } + } + ] + } + ] + } + ] + } + }, + { + "high_score": { + "fileName":"custom_mapping", + "mappingType":"default", + "columns":["score"], + "groups":[ + { + "values":[ + { + "name":"TOP", + "mapping":[ + { "groupColumn":"score", "value":"88" } + ] + } + ] + } + ] + } + } + ] + """; + + String msg = harmonizerService.parseFilesWithProgress( + "job-p6", + 
configs, + Map.of("cfg_prog", List.of("d_prog.csv")), + null + ); + + assertThat(msg).isEqualTo("Files processed successfully."); + Path out = baseDir.resolve("datasets").resolve("parsed_d_prog.csv"); + assertThat(out).exists(); + List rows = Files.readAllLines(out); + assertThat(rows.get(0)).isEqualTo("age;score;when;age_group;high_score"); + assertThat(rows).contains("25;88;2025-07-11T10:00:00Z;0;TOP"); + assertThat(rows).contains("42;55;2025-07-12T11:00:00Z;1;"); + } + + @Test + void startParseJob_success_updatesJobToDone() throws Exception { + makeCsv("async_ok.csv", "a;b\n1;2\n"); + String jobId = jobs.createJob(); + + harmonizerService.startParseJob( + jobId, + "[]", + Map.of("async_ok.csv", List.of("async_ok.csv")), + null + ); + + for (int i = 0; i < 60; i++) { + var st = jobs.getJob(jobId); + if (st != null && st.getState() != org.taniwha.dto.HarmonizationStatusDTO.State.RUNNING) break; + Thread.sleep(50); + } + + var state = jobs.getJob(jobId); + assertThat(state).isNotNull(); + assertThat(state.getState()).isEqualTo(org.taniwha.dto.HarmonizationStatusDTO.State.DONE); + assertThat(state.getPercent().get()).isEqualTo(100); + } + + @Test + void startParseJob_invalidJson_updatesJobToError() throws Exception { + String jobId = jobs.createJob(); + + harmonizerService.startParseJob( + jobId, + "{not-json}", + Map.of("bad", List.of("missing.csv")), + null + ); + + for (int i = 0; i < 60; i++) { + var st = jobs.getJob(jobId); + if (st != null && st.getState() == org.taniwha.dto.HarmonizationStatusDTO.State.ERROR) break; + Thread.sleep(50); + } + + var state = jobs.getJob(jobId); + assertThat(state).isNotNull(); + assertThat(state.getState()).isEqualTo(org.taniwha.dto.HarmonizationStatusDTO.State.ERROR); + assertThat(state.getMessage()).contains("Error processing files"); + } +} diff --git a/src/test/java/org/taniwha/service/NodeAccessServiceTest.java b/src/test/java/org/taniwha/service/NodeAccessServiceTest.java index 33c4706..aedb924 100644 --- 
a/src/test/java/org/taniwha/service/NodeAccessServiceTest.java +++ b/src/test/java/org/taniwha/service/NodeAccessServiceTest.java @@ -186,4 +186,119 @@ void validateUserToken_withStoredAndValid_returnsTrue() throws Exception { assertThat(spy.validateUserToken("jwt", "sgt")).isTrue(); } + + // ------------------------------------------------------------------------- + // isTicketValid – additional guard branches (via verifySgtTicket wrapper) + // ------------------------------------------------------------------------- + + /** + * Helper: build a SgtTicket mock that passes version/realm/sname/key checks, + * so we can test the time-based branches in isolation. + */ + private SgtTicket fullValidTicketBase() { + SgtTicket sgt = mock(SgtTicket.class); + Ticket t = mock(Ticket.class); + EncKdcRepPart part = mock(EncKdcRepPart.class); + PrincipalName pn = mock(PrincipalName.class); + + when(pn.getName()).thenReturn("HTTP/host.domain"); + when(sgt.getTicket()).thenReturn(t); + when(sgt.getEncKdcRepPart()).thenReturn(part); + when(t.getTktvno()).thenReturn(5); + when(t.getRealm()).thenReturn("REALM"); + when(t.getSname()).thenReturn(pn); + + EncryptionKey key = mock(EncryptionKey.class); + when(part.getKey()).thenReturn(key); + when(part.getFlags()).thenReturn(new org.apache.kerby.kerberos.kerb.type.ticket.TicketFlags()); + when(part.getAuthTime()).thenReturn(null); + when(part.getEndTime()).thenReturn(null); + when(part.getRenewTill()).thenReturn(null); + + return sgt; + } + + @Test + void verifySgtTicket_nullKeytab_returnsFalse() throws Exception { + SgtTicket sgt = fullValidTicketBase(); + when(principalService.getKeytab()).thenReturn(null); + + assertThat(service.verifySgtTicket(sgt)).isFalse(); + } + + @Test + void verifySgtTicket_futureAuthTime_returnsFalse() { + SgtTicket sgt = fullValidTicketBase(); + + // Set authTime to the far future + org.apache.kerby.kerberos.kerb.type.KerberosTime futureTime = + new org.apache.kerby.kerberos.kerb.type.KerberosTime( + 
System.currentTimeMillis() + 3_600_000L); // 1 hour ahead + when(sgt.getEncKdcRepPart().getAuthTime()).thenReturn(futureTime); + + assertThat(service.verifySgtTicket(sgt)).isFalse(); + } + + @Test + void verifySgtTicket_expiredEndTime_returnsFalse() { + SgtTicket sgt = fullValidTicketBase(); + + // Set endTime in the past + org.apache.kerby.kerberos.kerb.type.KerberosTime pastTime = + new org.apache.kerby.kerberos.kerb.type.KerberosTime( + System.currentTimeMillis() - 3_600_000L); // 1 hour ago + when(sgt.getEncKdcRepPart().getEndTime()).thenReturn(pastTime); + + assertThat(service.verifySgtTicket(sgt)).isFalse(); + } + + @Test + void verifySgtTicket_expiredRenewTill_returnsFalse() { + SgtTicket sgt = fullValidTicketBase(); + + // Set renewTill in the past + org.apache.kerby.kerberos.kerb.type.KerberosTime pastTime = + new org.apache.kerby.kerberos.kerb.type.KerberosTime( + System.currentTimeMillis() - 3_600_000L); + when(sgt.getEncKdcRepPart().getRenewTill()).thenReturn(pastTime); + + assertThat(service.verifySgtTicket(sgt)).isFalse(); + } + + @Test + void verifySgtTicket_nullFlags_returnsFalse() { + SgtTicket sgt = fullValidTicketBase(); + when(sgt.getEncKdcRepPart().getFlags()).thenReturn(null); + + assertThat(service.verifySgtTicket(sgt)).isFalse(); + } + + @Test + void verifySgtTicket_krbExceptionDuringDecrypt_returnsFalse() throws Exception { + SgtTicket sgt = fullValidTicketBase(); + + EncryptionKey key = sgt.getEncKdcRepPart().getKey(); + when(key.getKeyData()).thenReturn(new byte[]{1, 2, 3}); + when(key.getKeyType()).thenReturn(EncryptionType.AES128_CTS_HMAC_SHA1_96); + when(principalService.getKeytab()).thenReturn(new byte[]{0}); + when(krbService.loadKeyFromKeytab(anyString(), any(), any())) + .thenThrow(new org.apache.kerby.kerberos.kerb.KrbException("krb error")); + + assertThat(service.verifySgtTicket(sgt)).isFalse(); + } + + @Test + void validateUserToken_decodeError_returnsFalse() throws Exception { + 
when(jwtUtil.getUsernameFromToken("jwt")).thenReturn("user"); + when(krbService.decodeSgtTicket("bad")).thenThrow(new IOException("decode failed")); + + assertThat(service.validateUserToken("jwt", "bad")).isFalse(); + } + + @Test + void getHostName_invalidUri_returnsOriginal() { + // A string that is not a valid URI but won't throw (uses raw string) + String result = service.getHostName("not a uri with spaces"); + assertThat(result).isEqualTo("not a uri with spaces"); + } } diff --git a/src/test/java/org/taniwha/service/NodeSyncServiceTest.java b/src/test/java/org/taniwha/service/NodeSyncServiceTest.java index 57a0465..9338321 100644 --- a/src/test/java/org/taniwha/service/NodeSyncServiceTest.java +++ b/src/test/java/org/taniwha/service/NodeSyncServiceTest.java @@ -176,4 +176,160 @@ void logErrorToCentralBackend_sslHandshake_refreshesAndPostsTwice() { verify(restTemplate, times(2)) .postForEntity(anyString(), any(), eq(String.class)); } + + // ------------------------------------------------------------------------- + // sendHeartbeat – SSL handshake + generic RestClientException + // ------------------------------------------------------------------------- + + @Test + void sendHeartbeat_sslHandshake_refreshesAndRetries() { + org.springframework.test.util.ReflectionTestUtils.setField(service, "objectId", "OID"); + when(jwtUtil.generateToken("user")).thenReturn("JWT-SSL"); + ResponseEntity ok = ResponseEntity.ok("ok"); + + when(restTemplate.postForEntity(anyString(), any(), eq(String.class))) + .thenThrow(new ResourceAccessException("io", new SSLHandshakeException("ssl"))) + .thenReturn(ok); + stubRetryTemplate(); + + service.sendHeartbeat(); + + verify(restHolder).refresh(); + verify(restTemplate, times(2)) + .postForEntity(anyString(), any(), eq(String.class)); + } + + @Test + void sendHeartbeat_restClientException_logsAndDoesNotThrow() { + org.springframework.test.util.ReflectionTestUtils.setField(service, "objectId", "OID"); + 
when(jwtUtil.generateToken("user")).thenReturn("JWT-RCE"); + when(retryTemplate.execute(any())) + .thenThrow(new org.springframework.web.client.RestClientException("network down")); + + // should not throw + service.sendHeartbeat(); + } + + // ------------------------------------------------------------------------- + // handleHeartbeatException – via sendHeartbeat (HttpClientErrorException) + // ------------------------------------------------------------------------- + + @Test + void sendHeartbeat_http400_logsErrorAndSendsErrorLog() { + org.springframework.test.util.ReflectionTestUtils.setField(service, "objectId", "OID"); + when(jwtUtil.generateToken("user")).thenReturn("JWT-400"); + + stubRetryTemplate_heartbeatThrowsThenErrorLogSucceeds( + HttpClientErrorException.create( + HttpStatus.BAD_REQUEST, "Bad Request", + org.springframework.http.HttpHeaders.EMPTY, + "bad body".getBytes(), java.nio.charset.StandardCharsets.UTF_8)); + + service.sendHeartbeat(); + + verify(restTemplate).postForEntity( + eq("http://central/api/error"), any(HttpEntity.class), eq(String.class)); + } + + @Test + void sendHeartbeat_http401_logsErrorAndSendsErrorLog() { + org.springframework.test.util.ReflectionTestUtils.setField(service, "objectId", "OID"); + when(jwtUtil.generateToken("user")).thenReturn("JWT-401"); + + stubRetryTemplate_heartbeatThrowsThenErrorLogSucceeds( + HttpClientErrorException.create( + HttpStatus.UNAUTHORIZED, "Unauthorized", + org.springframework.http.HttpHeaders.EMPTY, + "".getBytes(), java.nio.charset.StandardCharsets.UTF_8)); + + service.sendHeartbeat(); + + verify(restTemplate).postForEntity( + eq("http://central/api/error"), any(HttpEntity.class), eq(String.class)); + } + + @Test + void sendHeartbeat_http500_logsErrorAndSendsErrorLog() { + org.springframework.test.util.ReflectionTestUtils.setField(service, "objectId", "OID"); + when(jwtUtil.generateToken("user")).thenReturn("JWT-500"); + + stubRetryTemplate_heartbeatThrowsThenErrorLogSucceeds( + 
HttpClientErrorException.create( + HttpStatus.INTERNAL_SERVER_ERROR, "Server Error", + org.springframework.http.HttpHeaders.EMPTY, + "".getBytes(), java.nio.charset.StandardCharsets.UTF_8)); + + service.sendHeartbeat(); + + verify(restTemplate).postForEntity( + eq("http://central/api/error"), any(HttpEntity.class), eq(String.class)); + } + + // ------------------------------------------------------------------------- + // registerWithCentralBackend – non-conflict HttpClientErrorException + // ------------------------------------------------------------------------- + + @Test + void registerWithCentralBackend_nonConflictHttpError_doesNotSetKeytab() { + when(jwtUtil.generateToken("user")).thenReturn("JWT-NF"); + when(restTemplate.postForObject(anyString(), any(), eq(RegisterResponseDTO.class))) + .thenThrow(new HttpClientErrorException(HttpStatus.NOT_FOUND)); + + stubRetryTemplate(); + + service.registerWithCentralBackend(); + + verify(principalService, never()).setKeytab(any()); + } + + // ------------------------------------------------------------------------- + // logErrorToCentralBackend – non-SSL ResourceAccessException + // ------------------------------------------------------------------------- + + @Test + void logErrorToCentralBackend_nonSslResourceAccess_rethrowsViaRetry() { + when(jwtUtil.generateToken("user")).thenReturn("JWT-RE"); + when(retryTemplate.execute(any())) + .thenThrow(new ResourceAccessException("network timeout")); + + // should not throw – retry exhausted and RestClientException is caught + service.logErrorToCentralBackend("e", "i"); + } + + // ------------------------------------------------------------------------- + // registerWithCentralBackend – CONFLICT path + // ------------------------------------------------------------------------- + + @Test + void registerWithCentralBackend_conflict_logsAndDoesNotSetKeytab() { + when(jwtUtil.generateToken("user")).thenReturn("JWT-CONFLICT"); + when(retryTemplate.execute(any())) + .thenThrow(new 
HttpClientErrorException(HttpStatus.CONFLICT)); + + service.registerWithCentralBackend(); + + verify(principalService, never()).setKeytab(any()); + } + + // ------------------------------------------------------------------------- + // Helpers + // ------------------------------------------------------------------------- + + /** + * Stubs the retry template so the first call (heartbeat) throws the given + * exception, and the second call (from logErrorToCentralBackend inside + * handleHeartbeatException) executes the callback normally. + */ + @SuppressWarnings("unchecked") + private void stubRetryTemplate_heartbeatThrowsThenErrorLogSucceeds( + HttpClientErrorException heartbeatException) { + when(retryTemplate.execute(any(RetryCallback.class))) + .thenThrow(heartbeatException) + .thenAnswer(inv -> { + RetryCallback cb = inv.getArgument(0); + RetryContext ctx = mock(RetryContext.class); + when(ctx.getRetryCount()).thenReturn(0); + return cb.doWithRetry(ctx); + }); + } } diff --git a/src/test/java/org/taniwha/service/jobs/AnalyticsProcessingJobsTest.java b/src/test/java/org/taniwha/service/jobs/AnalyticsProcessingJobsTest.java index be6373f..ea0c0bd 100644 --- a/src/test/java/org/taniwha/service/jobs/AnalyticsProcessingJobsTest.java +++ b/src/test/java/org/taniwha/service/jobs/AnalyticsProcessingJobsTest.java @@ -9,6 +9,7 @@ import java.util.List; import static org.assertj.core.api.Assertions.*; +import static org.mockito.Mockito.*; class AnalyticsProcessingJobsTest { @@ -182,4 +183,146 @@ void concurrentAccess_shouldBeSafe() throws InterruptedException { assertThat(state).isNotNull(); assertThat(state.getPercent().get()).isBetween(0, 100); } + + // ------------------------------------------------------------------------- + // cancel + isCanceled + // ------------------------------------------------------------------------- + + @Test + void cancel_runningJob_withFuture_cancelsItAndSetsState() { + String jobId = jobs.createJob(); + java.util.concurrent.Future future = 
mock(java.util.concurrent.Future.class); + jobs.attachFuture(jobId, future); + + jobs.cancel(jobId, "user canceled"); + + assertThat(jobs.getJob(jobId).getState()).isEqualTo(ProcessingStatusDTO.State.CANCELED); + assertThat(jobs.getJob(jobId).getMessage()).isEqualTo("user canceled"); + verify(future).cancel(true); + } + + @Test + void cancel_runningJob_noFuture_setsStateWithoutThrow() { + String jobId = jobs.createJob(); + + jobs.cancel(jobId, "no future"); + + assertThat(jobs.getJob(jobId).getState()).isEqualTo(ProcessingStatusDTO.State.CANCELED); + assertThat(jobs.getJob(jobId).getMessage()).isEqualTo("no future"); + } + + @Test + void cancel_nullMessage_usesDefaultMessage() { + String jobId = jobs.createJob(); + + jobs.cancel(jobId, null); + + assertThat(jobs.getJob(jobId).getMessage()).isEqualTo("Job canceled"); + } + + @Test + void cancel_alreadyDone_isNoOp() { + String jobId = jobs.createJob(); + jobs.complete(jobId, List.of()); + + jobs.cancel(jobId, "too late"); + + assertThat(jobs.getJob(jobId).getState()).isEqualTo(ProcessingStatusDTO.State.DONE); + } + + @Test + void cancel_alreadyError_isNoOp() { + String jobId = jobs.createJob(); + jobs.fail(jobId, "failed"); + + jobs.cancel(jobId, "too late"); + + assertThat(jobs.getJob(jobId).getState()).isEqualTo(ProcessingStatusDTO.State.ERROR); + } + + @Test + void cancel_alreadyCanceled_isNoOp() { + String jobId = jobs.createJob(); + jobs.cancel(jobId, "first"); + jobs.cancel(jobId, "second"); + + assertThat(jobs.getJob(jobId).getMessage()).isEqualTo("first"); + } + + @Test + void cancel_unknownJobId_doesNotThrow() { + assertThatCode(() -> jobs.cancel("ghost", "x")).doesNotThrowAnyException(); + } + + @Test + void isCanceled_runningJob_returnsFalse() { + String jobId = jobs.createJob(); + assertThat(jobs.isCanceled(jobId)).isFalse(); + } + + @Test + void isCanceled_canceledJob_returnsTrue() { + String jobId = jobs.createJob(); + jobs.cancel(jobId, "c"); + assertThat(jobs.isCanceled(jobId)).isTrue(); + } + + @Test 
+ void isCanceled_unknownJobId_returnsTrue() { + assertThat(jobs.isCanceled("ghost")).isTrue(); + } + + // ------------------------------------------------------------------------- + // attachFuture + // ------------------------------------------------------------------------- + + @Test + void attachFuture_knownJob_storesFuture() { + String jobId = jobs.createJob(); + java.util.concurrent.Future future = mock(java.util.concurrent.Future.class); + + jobs.attachFuture(jobId, future); + + assertThat(jobs.getJob(jobId).getFuture()).isSameAs(future); + } + + @Test + void attachFuture_unknownJob_doesNotThrow() { + java.util.concurrent.Future future = mock(java.util.concurrent.Future.class); + assertThatCode(() -> jobs.attachFuture("ghost", future)).doesNotThrowAnyException(); + } + + // ------------------------------------------------------------------------- + // Guard branches in update / fail / complete when already CANCELED + // ------------------------------------------------------------------------- + + @Test + void update_canceledJob_isNoOp() { + String jobId = jobs.createJob(); + jobs.cancel(jobId, "c"); + + jobs.update(jobId, 50, "file.csv"); + + assertThat(jobs.getJob(jobId).getCurrentFile()).isNull(); + } + + @Test + void fail_canceledJob_isNoOp() { + String jobId = jobs.createJob(); + jobs.cancel(jobId, "c"); + + jobs.fail(jobId, "error"); + + assertThat(jobs.getJob(jobId).getState()).isEqualTo(ProcessingStatusDTO.State.CANCELED); + } + + @Test + void complete_canceledJob_isNoOp() { + String jobId = jobs.createJob(); + jobs.cancel(jobId, "c"); + + jobs.complete(jobId, List.of()); + + assertThat(jobs.getJob(jobId).getState()).isEqualTo(ProcessingStatusDTO.State.CANCELED); + } } \ No newline at end of file diff --git a/src/test/resources/logback-test.xml b/src/test/resources/logback-test.xml new file mode 100644 index 0000000..a67c053 --- /dev/null +++ b/src/test/resources/logback-test.xml @@ -0,0 +1,16 @@ + + + + %d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level 
%logger{36} - %msg%n</pattern>
+        </encoder>
+    </appender>
+
+    <logger name="org.taniwha" level="DEBUG"/>
+    <logger name="org.springframework" level="WARN"/>
+
+    <root level="INFO">
+        <appender-ref ref="CONSOLE"/>
+    </root>
+
+</configuration>