@@ -2128,7 +2128,11 @@ public List<Object[]> getTopicWordsByDocumentId(long documentId) throws Database

public Map<Long, Long> getUnifiedTopicToSentenceMap(long documentId) throws DatabaseOperationException {
return executeOperationSafely((session) -> {
String sql = "SELECT unifiedtopic_id, sentence_id FROM sentencetopics WHERE document_id = :documentId";
String sql =
"SELECT unifiedtopic_id, sentence_id " +
"FROM sentencetopics " +
"WHERE document_id = :documentId " +
"AND unifiedtopic_id IS NOT NULL";

var query = session.createNativeQuery(sql)
.setParameter("documentId", documentId);
@@ -2270,4 +2274,35 @@ private String escapeSql(String input) {
return input.replace("(", "\\(").replace(")", "\\)").replace(":", "\\:").replace("|", "\\|");
}

/**
 * Inserts a sentence-level topic classification into the database.
 * The sentence is matched by its begin and end offsets within the given document, and a
 * corresponding entry is inserted into the sentencetopics table, unless that sentence
 * already carries the same topic label.
 *
 * @return the number of rows inserted (0 if no matching sentence was found or the label already exists for it)
 */
public int insertSentenceTopicBySpan(long documentId, int begin, int end, String topicLabel, double score)
throws DatabaseOperationException {

return executeOperationSafely((session) -> {

String sql =
"INSERT INTO sentencetopics (document_id, sentence_id, topiclabel, thetast) " +
"SELECT :docId, s.id, :label, :score " +
"FROM sentence s " +
"WHERE s.document_id = :docId AND s.beginn = :begin AND s.endd = :end " +
"AND NOT EXISTS ( " +
" SELECT 1 FROM sentencetopics st " +
" WHERE st.sentence_id = s.id AND st.topiclabel = :label " +
")";

var query = session.createNativeQuery(sql);
query.setParameter("docId", documentId);
query.setParameter("begin", begin);
query.setParameter("end", end);
query.setParameter("label", topicLabel);
query.setParameter("score", score);

return query.executeUpdate();
});
}
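
// A minimal usage sketch (illustrative only; the document id, offsets, label and score are
// made-up values, and the surrounding DAO wiring is assumed rather than taken from this change):
//
//   // Link the label "politics" with score 0.87 to the sentence spanning characters 120..245
//   // of document 42. The call returns 1 if a matching sentence exists and the
//   // (sentence, label) pair is not present yet, and 0 otherwise.
//   int rows = insertSentenceTopicBySpan(42L, 120, 245, "politics", 0.87);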

}
@@ -80,6 +80,11 @@
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.uima.cas.Feature;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.CAS;

public class Importer {

private static final Gson gson = new Gson();
@@ -1817,6 +1822,67 @@ private void postProccessCorpus(Corpus corpus, CorpusConfig corpusConfig) {
logger.info("Done with the corpus postprocessing.");
}

/**
 * Imports sentence-level topic annotations from the given XMI file into the database.
 * Each Topic annotation is matched to an existing sentence using its begin and end offsets.
 * For every (label, score) pair found, a corresponding entry is inserted into the
 * sentencetopics table, linking the topic classification to the sentence.
 * The method only performs sentence-level imports and does not create unified or
 * aggregated topic representations.
 */
private void importSentenceTopicsFromXmiIntoDb(Document document, String xmiFilePath) {
try {
var jCas = JCasFactory.createJCas();

try (InputStream raw = Files.newInputStream(Paths.get(xmiFilePath));
InputStream in = xmiFilePath.endsWith(".gz") ? new GZIPInputStream(raw) : raw) {

CasIOUtils.load(in, jCas.getCas());
}

if (casView != null) {
jCas = jCas.getView(casView);
}

var topicAnnos = JCasUtil.select(jCas, org.texttechnologylab.annotation.Topic.class);
if (topicAnnos.isEmpty()) {
logger.info("No Topic annotations found in XMI: {}", xmiFilePath);
return;
}

int inserted = 0;

for (var topicSpan : topicAnnos) {
int begin = topicSpan.getBegin();
int end = topicSpan.getEnd();

var topicsArr = topicSpan.getTopics();
if (topicsArr == null || topicsArr.size() == 0) continue;

for (int i = 0; i < topicsArr.size(); i++) {
var fs = topicsArr.get(i);
if (!(fs instanceof AnnotationComment comment)) continue;

String label = comment.getKey();
String valueStr = comment.getValue();
if (label == null || label.isBlank() || valueStr == null || valueStr.isBlank()) continue;

double score;
try { score = Double.parseDouble(valueStr); }
catch (NumberFormatException nfe) { continue; }

inserted += db.insertSentenceTopicBySpan(document.getId(), begin, end, label, score);
}
}

logger.info("Imported sentence topic annotations into sentencetopics: documentId={}, insertedRows={}",
document.getId(), inserted);

} catch (Exception ex) {
logger.error("Error importing sentence topics from XMI into DB. xmi={}", xmiFilePath, ex);
}
}
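
// Illustrative shape of the expected input (an assumption inferred from the loop above, not a
// normative description of the type system): each Topic annotation covers one sentence span and
// its getTopics() array holds AnnotationComment entries whose key is the topic label and whose
// value is the score serialized as a string, e.g.
//
//   Topic[begin=120, end=245]
//     topics[0] = AnnotationComment(key = "politics", value = "0.87")
//     topics[1] = AnnotationComment(key = "economy",  value = "0.10")
//
// Each well-formed pair yields at most one row in sentencetopics via
// db.insertSentenceTopicBySpan(documentId, begin, end, label, score).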

/**
 * Here we apply any postprocessing of a document that isn't DUUI and needs the document to be stored first,
 * like the RAG vector embeddings.
@@ -1825,6 +1891,8 @@ private void postProccessDocument(Document document, Corpus corpus, String fileP
logImportInfo("Postprocessing " + filePath, LogStatus.POST_PROCESSING, filePath, 0);
var start = System.currentTimeMillis();
var corpusConfig = corpus.getViewModel().getCorpusConfig();
// Import sentence-level topic annotations (News XMI: annotation2:Topic + AnnotationComment)
importSentenceTopicsFromXmiIntoDb(document, filePath);

// Store simple connections between Time, Geonames and Annotation to approximate the question:
// This annotation occurred in context with this location at this time.