From 6082fd4d99a7030a2e1daed855dc5427db4910d0 Mon Sep 17 00:00:00 2001 From: Ph1l1ppGitHub Date: Wed, 28 Jan 2026 15:59:40 +0100 Subject: [PATCH] Follow-up: Anpassungen am Importer & DB-Handling --- .../PostgresqlDataInterface_Impl.java | 37 +++++++++- .../uce/corpusimporter/Importer.java | 68 +++++++++++++++++++ 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/uce.portal/uce.common/src/main/java/org/texttechnologylab/uce/common/services/PostgresqlDataInterface_Impl.java b/uce.portal/uce.common/src/main/java/org/texttechnologylab/uce/common/services/PostgresqlDataInterface_Impl.java index 4ce7ecd7..4fa93eb0 100644 --- a/uce.portal/uce.common/src/main/java/org/texttechnologylab/uce/common/services/PostgresqlDataInterface_Impl.java +++ b/uce.portal/uce.common/src/main/java/org/texttechnologylab/uce/common/services/PostgresqlDataInterface_Impl.java @@ -2128,7 +2128,11 @@ public List getTopicWordsByDocumentId(long documentId) throws Database public Map getUnifiedTopicToSentenceMap(long documentId) throws DatabaseOperationException { return executeOperationSafely((session) -> { - String sql = "SELECT unifiedtopic_id, sentence_id FROM sentencetopics WHERE document_id = :documentId"; + String sql = + "SELECT unifiedtopic_id, sentence_id " + + "FROM sentencetopics " + + "WHERE document_id = :documentId " + + "AND unifiedtopic_id IS NOT NULL"; var query = session.createNativeQuery(sql) .setParameter("documentId", documentId); @@ -2270,4 +2274,35 @@ private String escapeSql(String input) { return input.replace("(", "\\(").replace(")", "\\)").replace(":", "\\:").replace("|", "\\|"); } + /** + * Inserts a sentence-level topic classification into the database. 
+ * This method matches a sentence by its begin and end offsets within a given document and inserts a corresponding entry + * (topic label, with the score stored in the thetast column) into the sentencetopics table. A sentence that already has an entry with the same topic label is skipped, and nothing is inserted when no sentence of the document has exactly these offsets. Returns the row count reported by executeUpdate(). + */ + public int insertSentenceTopicBySpan(long documentId, int begin, int end, String topicLabel, double score) + throws DatabaseOperationException { + + return executeOperationSafely((session) -> { + + String sql = + "INSERT INTO sentencetopics (document_id, sentence_id, topiclabel, thetast) " + + "SELECT :docId, s.id, :label, :score " + + "FROM sentence s " + + "WHERE s.document_id = :docId AND s.beginn = :begin AND s.endd = :end " + + "AND NOT EXISTS ( " + + " SELECT 1 FROM sentencetopics st " + + " WHERE st.sentence_id = s.id AND st.topiclabel = :label " + + ")"; + + var query = session.createNativeQuery(sql); + query.setParameter("docId", documentId); + query.setParameter("begin", begin); + query.setParameter("end", end); + query.setParameter("label", topicLabel); + query.setParameter("score", score); + + return query.executeUpdate(); + }); + } + } diff --git a/uce.portal/uce.corpus-importer/src/main/java/org/texttechnologylab/uce/corpusimporter/Importer.java b/uce.portal/uce.corpus-importer/src/main/java/org/texttechnologylab/uce/corpusimporter/Importer.java index bba58c2d..483564aa 100644 --- a/uce.portal/uce.corpus-importer/src/main/java/org/texttechnologylab/uce/corpusimporter/Importer.java +++ b/uce.portal/uce.corpus-importer/src/main/java/org/texttechnologylab/uce/corpusimporter/Importer.java @@ -80,6 +80,11 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.CAS; + public class Importer { private static final Gson gson = new Gson(); @@ -1817,6 +1822,67 @@ private void postProccessCorpus(Corpus corpus, CorpusConfig corpusConfig) { logger.info("Done with the corpus postprocessing."); } + /** + * Imports the Topic annotations found in the given XMI file into the database. Each topic annotation is matched to an 
existing sentence using + * its begin and end offsets. For every (label, score) pair found, + * a corresponding entry is inserted into the + * sentencetopics table, linking the topic classification to the sentence. + * The method only performs sentence-level imports and does not + * create unified or aggregated topic representations. Exceptions are caught and logged instead of being propagated to the caller. + */ + private void importSentenceTopicsFromXmiIntoDb(Document document, String xmiFilePath) { + try { + var jCas = JCasFactory.createJCas(); + + try (InputStream raw = Files.newInputStream(Paths.get(xmiFilePath)); + InputStream in = xmiFilePath.endsWith(".gz") ? new GZIPInputStream(raw) : raw) { /* files with a gz extension are decompressed on the fly */ + + CasIOUtils.load(in, jCas.getCas()); + } + + if (casView != null) { /* casView, when set, selects the CAS view the annotations are read from */ + jCas = jCas.getView(casView); + } + + var topicAnnos = JCasUtil.select(jCas, org.texttechnologylab.annotation.Topic.class); + if (topicAnnos.isEmpty()) { + logger.info("No Topic annotations found in XMI: {}", xmiFilePath); + return; + } + + int inserted = 0; + + for (var topicSpan : topicAnnos) { + int begin = topicSpan.getBegin(); + int end = topicSpan.getEnd(); + + var topicsArr = topicSpan.getTopics(); + if (topicsArr == null || topicsArr.size() == 0) continue; + + for (int i = 0; i < topicsArr.size(); i++) { + var fs = topicsArr.get(i); + if (!(fs instanceof AnnotationComment comment)) continue; /* only AnnotationComment entries are used: key is the topic label, value is the score */ + + String label = comment.getKey(); + String valueStr = comment.getValue(); + if (label == null || label.isBlank() || valueStr == null || valueStr.isBlank()) continue; + + double score; + try { score = Double.parseDouble(valueStr); } + catch (NumberFormatException nfe) { continue; } /* skip entries whose value cannot be parsed as a number */ + + inserted += db.insertSentenceTopicBySpan(document.getId(), begin, end, label, score); + } + } + + logger.info("Imported sentence topic annotations into sentencetopics: documentId={}, insertedRows={}", + document.getId(), inserted); + + } catch (Exception ex) { + logger.error("Error importing sentence topics from XMI into DB. 
xmi={}", xmiFilePath, ex); + } + } + /** * Here we apply any postprocessing of a document that isn't DUUI and needs the document to be stored once like * the rag vector embeddings. @@ -1825,6 +1891,8 @@ private void postProccessDocument(Document document, Corpus corpus, String fileP logImportInfo("Postprocessing " + filePath, LogStatus.POST_PROCESSING, filePath, 0); var start = System.currentTimeMillis(); var corpusConfig = corpus.getViewModel().getCorpusConfig(); + // Import sentence-level topic annotations (News XMI: annotation2:Topic + AnnotationComment) + importSentenceTopicsFromXmiIntoDb(document, filePath); // Store simple connections between Time, Geonames and Annotation to approximate the question: // This annotation occurred in context with this location at this time.