From 6082fd4d99a7030a2e1daed855dc5427db4910d0 Mon Sep 17 00:00:00 2001
From: Ph1l1ppGitHub <philipp.engering@t-online.de>
Date: Wed, 28 Jan 2026 15:59:40 +0100
Subject: [PATCH] Follow-up: Anpassungen am Importer & DB-Handling

---
 .../PostgresqlDataInterface_Impl.java         | 37 +++++++++-
 .../uce/corpusimporter/Importer.java          | 68 +++++++++++++++++++
 2 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/uce.portal/uce.common/src/main/java/org/texttechnologylab/uce/common/services/PostgresqlDataInterface_Impl.java b/uce.portal/uce.common/src/main/java/org/texttechnologylab/uce/common/services/PostgresqlDataInterface_Impl.java
index 4ce7ecd7..4fa93eb0 100644
--- a/uce.portal/uce.common/src/main/java/org/texttechnologylab/uce/common/services/PostgresqlDataInterface_Impl.java
+++ b/uce.portal/uce.common/src/main/java/org/texttechnologylab/uce/common/services/PostgresqlDataInterface_Impl.java
@@ -2128,7 +2128,11 @@ public List<Object[]> getTopicWordsByDocumentId(long documentId) throws Database
 
     public Map<Long, Long> getUnifiedTopicToSentenceMap(long documentId) throws DatabaseOperationException {
         return executeOperationSafely((session) -> {
-            String sql = "SELECT unifiedtopic_id, sentence_id FROM sentencetopics WHERE document_id = :documentId";
+            String sql =
+                    "SELECT unifiedtopic_id, sentence_id " +
+                            "FROM sentencetopics " +
+                            "WHERE document_id = :documentId " +
+                            "AND unifiedtopic_id IS NOT NULL";
 
             var query = session.createNativeQuery(sql)
                     .setParameter("documentId", documentId);
@@ -2270,4 +2274,35 @@ private String escapeSql(String input) {
         return input.replace("(", "\\(").replace(")", "\\)").replace(":", "\\:").replace("|", "\\|");
     }
 
+    /**
+     * Stores one topic label and score for the sentence of a document that has exactly the given offsets.
+     * No row is written if no sentence matches or if that sentence already has a row with the same label.
+     * @param documentId id of the document; matched against the sentence and written to document_id
+     * @param begin      begin offset the sentence must have
+     * @param end        end offset the sentence must have
+     * @param topicLabel label written to topiclabel and used for the duplicate check
+     * @param score      value written to the thetast column
+     * @return the result of executeUpdate() as handed back by executeOperationSafely
+     * @throws DatabaseOperationException declared for executeOperationSafely; presumably on database failure
+     */
+    public int insertSentenceTopicBySpan(long documentId, int begin, int end, String topicLabel, double score)
+            throws DatabaseOperationException {
+        return executeOperationSafely((session) -> {
+            String insertSql = "INSERT INTO sentencetopics (document_id, sentence_id, topiclabel, thetast) " +
+                    "SELECT :docId, s.id, :label, :score " +
+                    "FROM sentence s " +
+                    "WHERE s.document_id = :docId AND s.beginn = :begin AND s.endd = :end " +
+                    "AND NOT EXISTS ( " +
+                    "  SELECT 1 FROM sentencetopics st " +
+                    "  WHERE st.sentence_id = s.id AND st.topiclabel = :label " + ")";
+            // Named parameters that occur twice in the statement (docId, label) are bound once.
+            return session.createNativeQuery(insertSql)
+                    .setParameter("docId", documentId)
+                    .setParameter("begin", begin)
+                    .setParameter("end", end)
+                    .setParameter("label", topicLabel)
+                    .setParameter("score", score)
+                    .executeUpdate();
+        });
+    }
+
 }
diff --git a/uce.portal/uce.corpus-importer/src/main/java/org/texttechnologylab/uce/corpusimporter/Importer.java b/uce.portal/uce.corpus-importer/src/main/java/org/texttechnologylab/uce/corpusimporter/Importer.java
index bba58c2d..483564aa 100644
--- a/uce.portal/uce.corpus-importer/src/main/java/org/texttechnologylab/uce/corpusimporter/Importer.java
+++ b/uce.portal/uce.corpus-importer/src/main/java/org/texttechnologylab/uce/corpusimporter/Importer.java
@@ -80,6 +80,11 @@
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.CAS;
+
 public class Importer {
 
     private static final Gson gson = new Gson();
@@ -1817,6 +1822,67 @@ private void postProccessCorpus(Corpus corpus, CorpusConfig corpusConfig) {
         logger.info("Done with the corpus postprocessing.");
     }
 
+    /**
+     * Imports the sentence-level topic annotations of an XMI file into the {@code sentencetopics} table.
+     * Each Topic annotation is handed to the database together with its begin and end offsets so it can
+     * be matched to an existing sentence; one insert is issued per (label, score) entry. Entries with a
+     * blank label or blank score are skipped; a non-numeric score is skipped with a warning. No unified
+     * or aggregated topics are created. Any exception is logged and not rethrown.
+     *
+     * @param document    the document whose id the inserted rows are linked to
+     * @param xmiFilePath path of the XMI file to read; a name ending in ".gz" is read as gzip
+     */
+    private void importSentenceTopicsFromXmiIntoDb(Document document, String xmiFilePath) {
+        try {
+            var jCas = JCasFactory.createJCas();
+            try (InputStream raw = Files.newInputStream(Paths.get(xmiFilePath));
+                 InputStream in = xmiFilePath.endsWith(".gz") ? new GZIPInputStream(raw) : raw) {
+                CasIOUtils.load(in, jCas.getCas());
+            }
+            if (casView != null) {
+                jCas = jCas.getView(casView);
+            }
+
+            var topicAnnos = JCasUtil.select(jCas, org.texttechnologylab.annotation.Topic.class);
+            if (topicAnnos.isEmpty()) {
+                logger.info("No Topic annotations found in XMI: {}", xmiFilePath);
+                return;
+            }
+
+            int inserted = 0;
+            for (var topicSpan : topicAnnos) {
+                int begin = topicSpan.getBegin();
+                int end = topicSpan.getEnd();
+                var topicsArr = topicSpan.getTopics();
+                if (topicsArr == null || topicsArr.size() == 0) continue;
+
+                for (int i = 0; i < topicsArr.size(); i++) {
+                    var fs = topicsArr.get(i);
+                    if (!(fs instanceof AnnotationComment comment)) continue;
+
+                    String label = comment.getKey();
+                    String valueStr = comment.getValue();
+                    if (label == null || label.isBlank() || valueStr == null || valueStr.isBlank()) continue;
+                    double score;
+                    try {
+                        score = Double.parseDouble(valueStr);
+                    } catch (NumberFormatException nfe) {
+                        // Do not drop malformed scores silently: leave a trace that names the offending entry.
+                        logger.warn("Skipping topic '{}' with non-numeric score '{}' in XMI: {}",
+                                label, valueStr, xmiFilePath);
+                        continue;
+                    }
+                    inserted += db.insertSentenceTopicBySpan(document.getId(), begin, end, label, score);
+                }
+            }
+
+            logger.info("Imported sentence topic annotations into sentencetopics: documentId={}, insertedRows={}",
+                    document.getId(), inserted);
+        } catch (Exception ex) {
+            logger.error("Error importing sentence topics from XMI into DB. xmi={}", xmiFilePath, ex);
+        }
+    }
+
     /**
      * Here we apply any postprocessing of a document that isn't DUUI and needs the document to be stored once like
      * the rag vector embeddings.
@@ -1825,6 +1891,8 @@ private void postProccessDocument(Document document, Corpus corpus, String fileP
         logImportInfo("Postprocessing " + filePath, LogStatus.POST_PROCESSING, filePath, 0);
         var start = System.currentTimeMillis();
         var corpusConfig = corpus.getViewModel().getCorpusConfig();
+        // Import sentence-level topic annotations (News XMI: annotation2:Topic + AnnotationComment)
+        importSentenceTopicsFromXmiIntoDb(document, filePath);
 
         // Store simple connections between Time, Geonames and Annotation to approximate the question:
         // This annotation occurred in context with this location at this time.