From b23619151abe0fc7194692eee7a9c05674f28ddc Mon Sep 17 00:00:00 2001 From: Paris Alexandre Date: Thu, 8 Aug 2024 16:40:37 +0200 Subject: [PATCH] Bug #13230: upgrade CSV metadata --- .../inout/importer/CSVMetadataFormatter.java | 118 +++++------------- ...SVMetadataToDataObjectPackageImporter.java | 63 ++-------- .../sedalib/inout/importer/model/Line.java | 39 ++++++ .../inout/importer/model/MetadataTag.java | 44 +++++++ .../importer/model/ValueAttrMetadataTag.java | 11 ++ .../importer/CSVMetadataFormatterTest.java | 54 +++++++- .../resources/import/expected_content_01.xml | 22 ++++ 7 files changed, 204 insertions(+), 147 deletions(-) create mode 100644 sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/Line.java create mode 100644 sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/MetadataTag.java create mode 100644 sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/ValueAttrMetadataTag.java create mode 100644 sedalib/src/test/resources/import/expected_content_01.xml diff --git a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataFormatter.java b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataFormatter.java index f4f4f47d..d9737cfd 100644 --- a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataFormatter.java +++ b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataFormatter.java @@ -1,6 +1,9 @@ package fr.gouv.vitam.tools.sedalib.inout.importer; +import fr.gouv.vitam.tools.sedalib.inout.importer.model.MetadataTag; +import fr.gouv.vitam.tools.sedalib.inout.importer.model.ValueAttrMetadataTag; import fr.gouv.vitam.tools.sedalib.utils.SEDALibException; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.text.StringEscapeUtils; import java.nio.file.Path; @@ -16,60 +19,7 @@ * It's a utility class for {@link CSVMetadataToDataObjectPackageImporter} to analyse header line, and then interpret other lines through the formatter. */ public class CSVMetadataFormatter { - /** - * Utility class for metadata formatter. - */ - private class MetadataTag { - - String name; - String value; - String attr; - MetadataTag parent; - LinkedHashMap> subTags; - - /** - * Instantiates a new Metadata tag. - * - * @param name the name - * @param parent the parent - */ - protected MetadataTag(String name, MetadataTag parent) throws SEDALibException { - if ((name != null) && (!name.matches("[a-zA-Z0-9_-]+"))) { - throw new SEDALibException("Caractère interdit dans le tag XML [" + name + "]"); - } - this.name = name; - this.value = null; - this.attr = null; - this.subTags = null; - this.parent = parent; - } - - @Override - public String toString() { - if (parent == null) { - return name; - } else { - return parent.toString() + "." + name; - } - } - } - - private class ValueAttrMetadataTag { - boolean isValue; - MetadataTag tag; - - /** - * Instantiates a new Value attr metadata tag. - * - * @param isValue the is value - * @param tag the tag - */ - public ValueAttrMetadataTag(boolean isValue, MetadataTag tag) { - this.isValue = isValue; - this.tag = tag; - } - } private static final String ID = "id"; private static final String FILE = "file"; @@ -83,7 +33,7 @@ public ValueAttrMetadataTag(boolean isValue, MetadataTag tag) { ID, FILE, PARENTID, PARENTFILE, OBJECTFILES ); private MetadataTag rootTag, contentTag, managementTag; - private LinkedHashMap tagHeaderColumnMapping; + private List tagHeaderColumnMapping; private int numberOfMandatoryHeaderFound; private int columnCount; @@ -93,7 +43,7 @@ public ValueAttrMetadataTag(boolean isValue, MetadataTag tag) { private int fileColumn; private int objectfilesColumn; private int parentGUIDColumn; - private boolean isOnlyFile; + private boolean isOnlyFile = false; private void analyseFirstColumns(String[] headerRow) throws SEDALibException { @@ -113,31 +63,26 @@ private void analyseFirstColumns(String[] headerRow) throws SEDALibException { objectfilesColumn = -1; parentGUIDColumn = -1; } else if (numberOfMandatoryHeaderFound == 2 && firstsMandatoryHeadersFound.containsAll(List.of(FILE, PARENTFILE))) { - isOnlyFile = false; guidColumn = firstsMandatoryHeadersFound.indexOf(FILE); fileColumn = guidColumn; objectfilesColumn = -1; parentGUIDColumn = firstsMandatoryHeadersFound.indexOf(PARENTFILE); } else if (numberOfMandatoryHeaderFound == 3 && firstsMandatoryHeadersFound.containsAll(List.of(FILE, PARENTFILE, ID))) { - isOnlyFile = false; guidColumn = firstsMandatoryHeadersFound.indexOf(ID); fileColumn = firstsMandatoryHeadersFound.indexOf(FILE); objectfilesColumn = -1; parentGUIDColumn = firstsMandatoryHeadersFound.indexOf(PARENTFILE); } else if (numberOfMandatoryHeaderFound == 3 && firstsMandatoryHeadersFound.containsAll(List.of(FILE, PARENTID, ID))) { - isOnlyFile = false; guidColumn = firstsMandatoryHeadersFound.indexOf(ID); fileColumn = firstsMandatoryHeadersFound.indexOf(FILE); objectfilesColumn = -1; parentGUIDColumn = firstsMandatoryHeadersFound.indexOf(PARENTID); } else if (numberOfMandatoryHeaderFound == 3 && firstsMandatoryHeadersFound.containsAll(List.of(OBJECTFILES, PARENTID, ID))) { - isOnlyFile = false; guidColumn = firstsMandatoryHeadersFound.indexOf(ID); fileColumn = -1; objectfilesColumn = firstsMandatoryHeadersFound.indexOf(OBJECTFILES); parentGUIDColumn = firstsMandatoryHeadersFound.indexOf(PARENTID); } else if (numberOfMandatoryHeaderFound == 4 && firstsMandatoryHeadersFound.containsAll(List.of(FILE, OBJECTFILES, PARENTID, ID))) { - isOnlyFile = false; guidColumn = firstsMandatoryHeadersFound.indexOf(ID); objectfilesColumn = firstsMandatoryHeadersFound.indexOf(OBJECTFILES); fileColumn = firstsMandatoryHeadersFound.indexOf(FILE); @@ -208,7 +153,7 @@ private MetadataTag getTag(MetadataTag tag, List splittedMetadataName) t private void analyseTags(String[] headerRow) throws SEDALibException { MetadataTag currentTag = null; - ValueAttrMetadataTag vamt; + ValueAttrMetadataTag valueAttrMetadataTag; if (headerRow.length <= numberOfMandatoryHeaderFound) { throw new SEDALibException("Pas de colonne de métadonnées."); @@ -221,21 +166,23 @@ private void analyseTags(String[] headerRow) throws SEDALibException { contentTag = rootTag; } managementTag = null; - tagHeaderColumnMapping = new LinkedHashMap<>(); + tagHeaderColumnMapping = new ArrayList<>(); for (int i = numberOfMandatoryHeaderFound; i < headerRow.length; i++) { - if (headerRow[i].equalsIgnoreCase("attr")) { + String value = headerRow[i]; + if (value.equalsIgnoreCase("attr")) { if (currentTag == null) { throw new SEDALibException("Le header attr en colonne n°" + i + " ne peut pas s'appliquer."); } - vamt = new ValueAttrMetadataTag(false, currentTag); - } else if (headerRow[i].endsWith(".attr")) { - currentTag = getTag(rootTag, new ArrayList<>(Arrays.asList(headerRow[i].split("\\.")))); - vamt = new ValueAttrMetadataTag(false, currentTag); + valueAttrMetadataTag = new ValueAttrMetadataTag(false, currentTag); + } else if (value.endsWith(".attr")) { + currentTag = getTag(rootTag, new ArrayList<>(Arrays.asList(value.split("\\.")))); + valueAttrMetadataTag = new ValueAttrMetadataTag(false, currentTag); } else { - currentTag = getTag(rootTag, new ArrayList<>(Arrays.asList(headerRow[i].split("\\.")))); - vamt = new ValueAttrMetadataTag(true, currentTag); + currentTag = getTag(rootTag, new ArrayList<>(Arrays.asList(value.split("\\.")))); + + valueAttrMetadataTag = new ValueAttrMetadataTag(true, currentTag); } - tagHeaderColumnMapping.put(i, vamt); + tagHeaderColumnMapping.add(valueAttrMetadataTag); } if (contentTag == null) { throw new SEDALibException("Pas de colonne de métadonnées Content."); @@ -257,14 +204,14 @@ public CSVMetadataFormatter(String[] headerRow, Path baseDir) throws SEDALibExce } private void resetValues() { - tagHeaderColumnMapping.values().stream().forEach(valueAttr -> { + tagHeaderColumnMapping.forEach(valueAttr -> { valueAttr.tag.value = null; valueAttr.tag.attr = null; }); } private void defineColumnValue(int headerColumn, String cell) { - ValueAttrMetadataTag vamt = tagHeaderColumnMapping.get(headerColumn); + ValueAttrMetadataTag vamt = tagHeaderColumnMapping.get(headerColumn - 1); if (vamt.isValue) { vamt.tag.value = cell; } else { @@ -286,46 +233,47 @@ private String getOneSubTagXML(MetadataTag tag, String subTagName) { } private boolean notEmptyValues(MetadataTag tag) { - boolean result = false; if (tag.subTags == null) { - return !((tag.value == null) || tag.value.isEmpty()); + return StringUtils.isNotEmpty(tag.value); } for (List tagList : tag.subTags.values()) { for (MetadataTag subTag : tagList) { - result = result || notEmptyValues(subTag); + if (notEmptyValues(subTag)) { + return true; + } } } - return result; + return false; } private String generateRuleTypeTagXML(MetadataTag tag) throws SEDALibException { - String result = ""; + StringBuilder result = new StringBuilder(); for (List tagList : tag.subTags.values()) { for (MetadataTag mt : tagList) { if (mt.name.equals("Rule") && !mt.value.isEmpty()) { - result += "" + StringEscapeUtils.escapeXml10(mt.value) + ""; + result.append("").append(StringEscapeUtils.escapeXml10(mt.value)).append(""); mt.value = null; } } for (MetadataTag mt : tagList) { if (mt.name.equals("StartDate") && !mt.value.isEmpty()) { - result += "" + StringEscapeUtils.escapeXml10(mt.value) + ""; + result.append("").append(StringEscapeUtils.escapeXml10(mt.value)).append(""); mt.value = null; } } } - result += getOneSubTagXML(tag, "PreventInheritance"); - result += getOneSubTagXML(tag, "RefNonRuleId"); - result += getOneSubTagXML(tag, "FinalAction"); + result.append(getOneSubTagXML(tag, "PreventInheritance")); + result.append(getOneSubTagXML(tag, "RefNonRuleId")); + result.append(getOneSubTagXML(tag, "FinalAction")); if (notEmptyValues(tag)) { throw new SEDALibException("La règle [" + tag.name + "] contient des champs non conformes SEDA."); } - if (!result.isEmpty()) { - result = "<" + tag.name + ">" + result + ""; + if (result.length() > 0) { + result = new StringBuilder("<" + tag.name + ">" + result + ""); } - return result; + return result.toString(); } private String generateHoldRuleTagXML(MetadataTag tag) throws SEDALibException { diff --git a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataToDataObjectPackageImporter.java b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataToDataObjectPackageImporter.java index 8ef0968f..d8938cfa 100644 --- a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataToDataObjectPackageImporter.java +++ b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataToDataObjectPackageImporter.java @@ -35,6 +35,7 @@ import fr.gouv.vitam.tools.sedalib.core.BinaryDataObject; import fr.gouv.vitam.tools.sedalib.core.DataObjectGroup; import fr.gouv.vitam.tools.sedalib.core.DataObjectPackage; +import fr.gouv.vitam.tools.sedalib.inout.importer.model.Line; import fr.gouv.vitam.tools.sedalib.utils.SEDALibException; import fr.gouv.vitam.tools.sedalib.utils.SEDALibProgressLogger; @@ -94,60 +95,6 @@ */ public class CSVMetadataToDataObjectPackageImporter { - private class Line { - /** - * The Guid. - */ - String guid; - /** - * The Parent guid. - */ - String parentGUID; - /** - * The File. - */ - String file; - - /** - * The Object Files. - */ - List objectFiles; - - /** - * The Content xml metadata. - */ - String contentXMLMetadata; - /** - * The Management xml metadata. - */ - String managementXMLMetadata; - /** - * The Au. - */ - ArchiveUnit au; - - /** - * Instantiates a new Line. - * - * @param guid the guid - * @param parentGUID the parent guid - * @param file the file - * @param contentXMLMetadata the content xml metadata - */ - public Line(String guid, String parentGUID, String file, String objectFiles, String contentXMLMetadata, String managementXMLMetadata) { - this.guid = guid; - this.parentGUID = parentGUID; - this.file = file; - if (objectFiles.trim().isEmpty()) - this.objectFiles = Arrays.asList(); - else - this.objectFiles = Arrays.asList(objectFiles.split("\\|")); - this.contentXMLMetadata = contentXMLMetadata; - this.managementXMLMetadata = managementXMLMetadata; - this.au = null; - } - } - /** * The csv metadata file name . */ @@ -241,8 +188,12 @@ private boolean readCSVFile() throws SEDALibException, InterruptedException { continue; } try { - currentLine = new Line(metadataFormatter.getGUID(row), metadataFormatter.getParentGUID(row), //NOSONAR - metadataFormatter.getFile(row), metadataFormatter.getObjectFiles(row), metadataFormatter.doFormatAndExtractContentXML(row), metadataFormatter.extractManagementXML()); + currentLine = new Line(metadataFormatter.getGUID(row), + metadataFormatter.getParentGUID(row), //NOSONAR + metadataFormatter.getFile(row), + metadataFormatter.getObjectFiles(row), + metadataFormatter.doFormatAndExtractContentXML(row), + metadataFormatter.extractManagementXML()); } catch (SEDALibException e) { throw new SEDALibException("Erreur sur la ligne "+lineCount, e); } diff --git a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/Line.java b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/Line.java new file mode 100644 index 00000000..7c8337ad --- /dev/null +++ b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/Line.java @@ -0,0 +1,39 @@ +package fr.gouv.vitam.tools.sedalib.inout.importer.model; + + +import fr.gouv.vitam.tools.sedalib.core.ArchiveUnit; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class Line { + + public String guid; + public String parentGUID; + public String file; + public List objectFiles; + + /** + * everything inside + */ + public String contentXMLMetadata; + /** + * everything inside + */ + public String managementXMLMetadata; + public ArchiveUnit au; + + public Line(String guid, String parentGUID, String file, String objectFiles, String contentXMLMetadata, String managementXMLMetadata) { + this.guid = guid; + this.parentGUID = parentGUID; + this.file = file; + if (objectFiles.trim().isEmpty()) + this.objectFiles = Collections.emptyList(); + else + this.objectFiles = Arrays.asList(objectFiles.split("\\|")); + this.contentXMLMetadata = contentXMLMetadata; + this.managementXMLMetadata = managementXMLMetadata; + this.au = null; + } +} diff --git a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/MetadataTag.java b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/MetadataTag.java new file mode 100644 index 00000000..269a6e5b --- /dev/null +++ b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/MetadataTag.java @@ -0,0 +1,44 @@ +package fr.gouv.vitam.tools.sedalib.inout.importer.model; + +import fr.gouv.vitam.tools.sedalib.utils.SEDALibException; + +import java.util.LinkedHashMap; +import java.util.List; + +/** + * Utility class for metadata formatter. + */ +public class MetadataTag { + + public String name; + public String value; + public String attr; + public MetadataTag parent; + public LinkedHashMap> subTags; + + /** + * Instantiates a new Metadata tag. + * + * @param name the name + * @param parent the parent + */ + public MetadataTag(String name, MetadataTag parent) throws SEDALibException { + if ((name != null) && (!name.matches("[a-zA-Z0-9_-]+"))) { + throw new SEDALibException("Caractère interdit dans le tag XML [" + name + "]"); + } + this.name = name; + this.value = null; + this.attr = null; + this.subTags = null; + this.parent = parent; + } + + @Override + public String toString() { + if (parent == null) { + return name; + } else { + return parent + "." + name; + } + } +} \ No newline at end of file diff --git a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/ValueAttrMetadataTag.java b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/ValueAttrMetadataTag.java new file mode 100644 index 00000000..bea6e455 --- /dev/null +++ b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/inout/importer/model/ValueAttrMetadataTag.java @@ -0,0 +1,11 @@ +package fr.gouv.vitam.tools.sedalib.inout.importer.model; + +import lombok.AllArgsConstructor; + +@AllArgsConstructor +public class ValueAttrMetadataTag { + + public boolean isValue; + public MetadataTag tag; + +} \ No newline at end of file diff --git a/sedalib/src/test/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataFormatterTest.java b/sedalib/src/test/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataFormatterTest.java index c62e9055..de878ee5 100644 --- a/sedalib/src/test/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataFormatterTest.java +++ b/sedalib/src/test/java/fr/gouv/vitam/tools/sedalib/inout/importer/CSVMetadataFormatterTest.java @@ -8,22 +8,64 @@ import java.io.FileNotFoundException; import java.nio.file.Paths; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + class CSVMetadataFormatterTest { @Test - void newCSVMetadataFormatter_tests_bad_headers() { + void new_test_OK() throws FileNotFoundException, SEDALibException { + String[] row = {"Id", "ParentId", "File", "ObjectFiles", "Content.DescriptionLevel", "Content.Title", "Content.Description", "Content.TransactedDate", "Content.StartDate"}; + CSVMetadataFormatter csvMetadataFormatter = new CSVMetadataFormatter(row, Paths.get("/baseDir/")); + Assertions.assertTrue(csvMetadataFormatter.isExtendedFormat()); + } + + @Test + void new_test_with_bad_headers() { final String[] bad_row = {"One", "Two", "three"}; Assertions.assertThrows(SEDALibException.class, () -> new CSVMetadataFormatter(bad_row, Paths.get("whatever"))); String[] row_with_one_error = {"Id", "ParentId", "File", "ObjectFiles", "Content.DescriptionLevel", "Content.Title", "Content.Description", "Content.TransactedDate", "Content.StartDate", "CeaCategory"}; - Assertions.assertThrows(SEDALibException.class, () -> new CSVMetadataFormatter(row_with_one_error, Paths.get("whatever"))); + Assertions.assertThrows(SEDALibException.class, () -> new CSVMetadataFormatter(row_with_one_error, Paths.get("/baseDir/"))); } @Test - void newCSVMetadataFormatter_test_OK() throws FileNotFoundException, SEDALibException { - String[] row = {"Id", "ParentId", "File", "ObjectFiles", "Content.DescriptionLevel", "Content.Title", "Content.Description", "Content.TransactedDate", "Content.StartDate"}; - CSVMetadataFormatter csvMetadataFormatter = new CSVMetadataFormatter(row, ResourceUtils.getResourcePath("metadata_OK.csv").getParent()); - Assertions.assertTrue(csvMetadataFormatter.isExtendedFormat()); + void doFormatAndExtractContentXML_test_OK() throws FileNotFoundException, SEDALibException { + String[] row1 = { + "File", + "Content.DescriptionLevel", + "Content.Title", + "Content.SigningInformation.SigningRole.0", + "Content.SigningInformation.SigningRole.1", + "Content.SigningInformation.SigningRole.2", + "Content.SigningInformation.DetachedSigningRole", + "Content.SigningInformation.SignatureDescription.0.Signer.FullName", + "Content.SigningInformation.SignatureDescription.0.Signer.SigningTime", + "Content.SigningInformation.TimestampingInformation.TimeStamp", + "Content.SigningInformation.AdditionalProof.0.AdditionalProofInformation" + }; + String[] row2 = { + "whatever.pdf", + "Item", + "whatever", + "SignedDocument", + "Signature", + "Timestamp", + "AdditionalProof", + "Alexandre PARIS", + "2024-12-25T12:34:56", + "2023-06-22T11:36:49", + "AdditionalProofInformation" + }; + CSVMetadataFormatter metadataFormatter = new CSVMetadataFormatter(row1, Paths.get("/baseDir/")); + metadataFormatter.isExtendedFormat(); + + Assertions.assertEquals("/baseDir/whatever.pdf", metadataFormatter.getGUID(row2)); + Assertions.assertEquals("/baseDir", metadataFormatter.getParentGUID(row2)); + Assertions.assertEquals("/baseDir/whatever.pdf", metadataFormatter.getFile(row2)); + Assertions.assertEquals("", metadataFormatter.getObjectFiles(row2)); + assertThat(metadataFormatter.doFormatAndExtractContentXML(row2)).isEqualToIgnoringWhitespace(ResourceUtils.getResourceAsString("import/expected_content_01.xml")); + Assertions.assertEquals("", metadataFormatter.extractManagementXML()); } + } diff --git a/sedalib/src/test/resources/import/expected_content_01.xml b/sedalib/src/test/resources/import/expected_content_01.xml new file mode 100644 index 00000000..75478ee2 --- /dev/null +++ b/sedalib/src/test/resources/import/expected_content_01.xml @@ -0,0 +1,22 @@ + + Item + whatever + + SignedDocument + AdditionalProof + + + Alexandre PARIS + 2024-12-25T12:34:56 + + + + 2023-06-22T11:36:49 + + + AdditionalProofInformation + + Signature + Timestamp + + \ No newline at end of file