diff --git a/server/src/main/java/au/org/aodn/ogcapi/server/core/service/ElasticSearchBase.java b/server/src/main/java/au/org/aodn/ogcapi/server/core/service/ElasticSearchBase.java index 75059eeb..9cbb44de 100644 --- a/server/src/main/java/au/org/aodn/ogcapi/server/core/service/ElasticSearchBase.java +++ b/server/src/main/java/au/org/aodn/ogcapi/server/core/service/ElasticSearchBase.java @@ -4,6 +4,7 @@ import au.org.aodn.ogcapi.server.core.model.enumeration.CQLFields; import au.org.aodn.ogcapi.server.core.model.enumeration.CQLFieldsInterface; import au.org.aodn.ogcapi.server.core.model.enumeration.StacBasicField; +import au.org.aodn.ogcapi.server.core.model.enumeration.StacSummeries; import co.elastic.clients.elasticsearch.ElasticsearchClient; import co.elastic.clients.elasticsearch._types.*; import co.elastic.clients.elasticsearch._types.aggregations.*; @@ -157,7 +158,6 @@ protected SearchResult searchCollectionBy(final List final List sortOptions, final Double score, final Long maxSize) { - Supplier builderSupplier = () -> { SearchRequest.Builder builder = new SearchRequest.Builder(); builder.index(indexName) @@ -165,23 +165,97 @@ protected SearchResult searchCollectionBy(final List // we use the smaller one. The internal page size is used to get the result by // batch, lets say page is 20 and internal is 10, then we do it in two batch. // But if we request 5 only, then there is no point to load 10 - .size(maxSize != null && maxSize < pageSize ? maxSize.intValue() : pageSize) - .query(q -> q.bool(createBoolQueryForProperties(queries, should, filters))); + .size(maxSize != null && maxSize < pageSize ? 
maxSize.intValue() : pageSize); + + // use script score if search with text, in such case, the final score depends on both relevance and metadata quality + // put query in script block + // determine to use script score block or not + // only use script_score if sort options are provided and the should clauses are not empty + boolean useScriptScore = (sortOptions != null && !sortOptions.isEmpty()) && (should != null && !should.isEmpty()); + + if (useScriptScore) { + String summaryScore = StacSummeries.Score.searchField; + builder.query(q -> q.scriptScore(ss -> ss + // to get the original _score from Elasticsearch + .query(bq -> bq.bool(createBoolQueryForProperties(queries, should, filters))) + .script(s -> s.inline(i -> i + .lang("painless") + .source( + // Step 1: Retrieve internal quality score from summaries.score field + // Default to 0 if field doesn't exist or is empty + "double internalScore = doc.containsKey('"+summaryScore+"') && " + + "!doc['"+summaryScore+"'].empty ? doc['"+summaryScore+"'].value : 0.0; " + + + // Step 2: Normalize internal score to 0-1 range + // Assuming summaries.score is in range 0-106 + "double normalizedScore = internalScore / 106.0; " + + + // Step 3: Ensure minimum multiplier to avoid zero scores + "double multiplier = Math.max(normalizedScore, 0.01); " + + + // Step 4: Calculate final score + // Final score = Elasticsearch relevance * normalized quality + "return _score * multiplier;" + ) + ) + )) + ); + } + // use original query logic + else { + builder.query(q -> q.bool(createBoolQueryForProperties(queries, should, filters))); + } if(searchAfter != null) { builder.searchAfter(searchAfter); } - if(sortOptions != null) { - builder.sort(sortOptions); - } + // to use sort by uuid as a tiebreaker + boolean hasUuidSort = false; - builder.sort(so -> so - // We need a unique key for the search, cannot use _id in v8 anymore, so we need - // to sort using the keyword, this field is not for search and therefore not in enum - .field(FieldSort.of(f 
-> f - .field(StacBasicField.UUID.sortField) - .order(SortOrder.Asc)))); + // apply sort options + if (useScriptScore) { + // add sort options + if (sortOptions != null && !sortOptions.isEmpty()) { + for (SortOptions sortOption : sortOptions) { + builder.sort(sortOption); + + // check if it has sort by id option + if (sortOption.isField() && + sortOption.field().field().equals(StacBasicField.UUID.sortField)) { + hasUuidSort = true; + } + } + } + } + else { + // when not using script_score, apply all sort options + if (sortOptions != null && !sortOptions.isEmpty()) { + for (SortOptions sortOption : sortOptions) { + builder.sort(sortOption); + + // check if it has sort by id option + if (sortOption.isField() && + sortOption.field().field().equals(StacBasicField.UUID.sortField)) { + hasUuidSort = true; + } + } + } + else if (should != null && !should.isEmpty()) { + // If no sortOptions provided but there are text queries, + // default to sorting by _score + builder.sort(so -> so.score(sc -> sc.order(SortOrder.Desc))); + } + } + // add sort by id as the final tiebreaker if it was not already applied + if (!hasUuidSort) { + builder.sort(so -> so + // We need a unique key for the search, cannot use _id in v8 anymore, so we need + // to sort using the keyword, this field is not for search and therefore not in enum + .field(FieldSort.of(f -> f + .field(StacBasicField.UUID.sortField) + .order(SortOrder.Asc)))); + } if(score != null) { // By default we do not setup any min_score, the api caller should pass it in so diff --git a/server/src/test/java/au/org/aodn/ogcapi/server/common/RestApiTest.java b/server/src/test/java/au/org/aodn/ogcapi/server/common/RestApiTest.java index 1002b57c..4474b47e 100644 --- a/server/src/test/java/au/org/aodn/ogcapi/server/common/RestApiTest.java +++ b/server/src/test/java/au/org/aodn/ogcapi/server/common/RestApiTest.java @@ -552,7 +552,7 @@ public void verifyCQLPropertyScore() throws IOException { // Increase score will drop one record collections = 
testRestTemplate.getForEntity(getBasePath() + "/collections?q='dataset includes'&filter=score>=3", Collections.class); - assertEquals(3, Objects.requireNonNull(collections.getBody()).getCollections().size(), "hit 3, with score 3"); + assertEquals(2, Objects.requireNonNull(collections.getBody()).getCollections().size(), "hit 3, with score 3"); assertEquals("bf287dfe-9ce4-4969-9c59-51c39ea4d011", Objects.requireNonNull(collections.getBody()).getCollections().get(0).getId(), "bf287dfe-9ce4-4969-9c59-51c39ea4d011"); } diff --git a/server/src/test/java/au/org/aodn/ogcapi/server/features/RestApiTest.java b/server/src/test/java/au/org/aodn/ogcapi/server/features/RestApiTest.java index 68dda6c8..c357e1f5 100644 --- a/server/src/test/java/au/org/aodn/ogcapi/server/features/RestApiTest.java +++ b/server/src/test/java/au/org/aodn/ogcapi/server/features/RestApiTest.java @@ -63,11 +63,17 @@ public void verifyCorrectInternalPagingLargeData() throws IOException { // Given 6 records and we set page to 4, that means each query elastic return 4 record only // and the logic to load the reset can kick in. 
super.insertJsonToElasticRecordIndex( + // set test summaries.score 90 "5c418118-2581-4936-b6fd-d6bedfe74f62.json", + // set test summaries.score 106 "19da2ce7-138f-4427-89de-a50c724f5f54.json", + // set test summaries.score 70 "516811d7-cd1e-207a-e0440003ba8c79dd.json", + // set test summaries.score 60 "7709f541-fc0c-4318-b5b9-9053aa474e0e.json", + // set test summaries.score 50 "bc55eff4-7596-3565-e044-00144fdd4fa6.json", + // set test summaries.score 100 "bf287dfe-9ce4-4969-9c59-51c39ea4d011.json"); // Call rest api directly and get query result @@ -97,6 +103,7 @@ public void verifyCorrectInternalPagingLargeData() throws IOException { } /** * with page_size set, the max number of record return will equals page_size + * With default search, the sort should follow uuid order */ @Test public void verifyCorrectPageSizeDataReturn() throws IOException { @@ -105,11 +112,17 @@ public void verifyCorrectPageSizeDataReturn() throws IOException { // Given 6 records and we set page to 4, that means each query elastic return 4 record only // and the logic to load the reset can kick in. 
super.insertJsonToElasticRecordIndex( + // set test summaries.score 90 "5c418118-2581-4936-b6fd-d6bedfe74f62.json", + // set test summaries.score 106 "19da2ce7-138f-4427-89de-a50c724f5f54.json", + // set test summaries.score 70 "516811d7-cd1e-207a-e0440003ba8c79dd.json", + // set test summaries.score 60 "7709f541-fc0c-4318-b5b9-9053aa474e0e.json", + // set test summaries.score 50 "bc55eff4-7596-3565-e044-00144fdd4fa6.json", + // set test summaries.score 100 "bf287dfe-9ce4-4969-9c59-51c39ea4d011.json"); // Call rest api directly and get query result @@ -132,12 +145,12 @@ public void verifyCorrectPageSizeDataReturn() throws IOException { assertEquals(3, collections.getBody().getSearchAfter().size(), "search_after three fields"); assertEquals("1.0", collections.getBody().getSearchAfter().get(0), "Search after 1 value"); assertEquals( - "100", + "90", collections.getBody().getSearchAfter().get(1), "search_after 2 arg" ); assertEquals( - "str:bf287dfe-9ce4-4969-9c59-51c39ea4d011", + "str:5c418118-2581-4936-b6fd-d6bedfe74f62", collections.getBody().getSearchAfter().get(2), "search_after 3 arg" ); @@ -185,6 +198,11 @@ public void verifyCorrectPageSizeDataReturn() throws IOException { * Extreme case, page size set to 1 and query text "dataset" and page one by one. Only part of the json * will be return, the sort value should give you the next item and you will be able to go to next one. 
* The first sort value is the relevant and because of query text the value will be something greater than 1.0 + * After weighted sorting, the actual order is (for the first 4 records): + * Document 0: UUID=bf287dfe-9ce4-4969-9c59-51c39ea4d011 + * Document 1: UUID=19da2ce7-138f-4427-89de-a50c724f5f54 + * Document 2: UUID=bc55eff4-7596-3565-e044-00144fdd4fa6 + * Document 3: UUID=7709f541-fc0c-4318-b5b9-9053aa474e0e */ @Test public void verifyCorrectPageSizeDataReturnWithQuery() throws IOException { @@ -193,11 +211,17 @@ public void verifyCorrectPageSizeDataReturnWithQuery() throws IOException { // Given 6 records and we set page to 4, that means each query elastic return 4 record only // and the logic to load the reset can kick in. super.insertJsonToElasticRecordIndex( + // set test summaries.score 90 "5c418118-2581-4936-b6fd-d6bedfe74f62.json", + // set test summaries.score 106 "19da2ce7-138f-4427-89de-a50c724f5f54.json", + // set test summaries.score 70 "516811d7-cd1e-207a-e0440003ba8c79dd.json", + // set test summaries.score 60 "7709f541-fc0c-4318-b5b9-9053aa474e0e.json", + // set test summaries.score 50 "bc55eff4-7596-3565-e044-00144fdd4fa6.json", + // set test summaries.score 100 "bf287dfe-9ce4-4969-9c59-51c39ea4d011.json"); // Call rest api directly and get query result with search on "dataset" @@ -220,7 +244,7 @@ public void verifyCorrectPageSizeDataReturnWithQuery() throws IOException { // The search after give you the value to go to next batch assertEquals(3, collections.getBody().getSearchAfter().size(), "search_after three fields"); assertEquals( - "str:bc55eff4-7596-3565-e044-00144fdd4fa6", + "str:bf287dfe-9ce4-4969-9c59-51c39ea4d011", collections.getBody().getSearchAfter().get(2), "search_after 2 arg" ); @@ -232,7 +256,7 @@ public void verifyCorrectPageSizeDataReturnWithQuery() throws IOException { String.format("'%s||%s||%s'", collections.getBody().getSearchAfter().get(0), collections.getBody().getSearchAfter().get(1), - 
"bc55eff4-7596-3565-e044-00144fdd4fa6"), + "bf287dfe-9ce4-4969-9c59-51c39ea4d011"), HttpMethod.GET, null, new ParameterizedTypeReference<>() { @@ -249,7 +273,7 @@ public void verifyCorrectPageSizeDataReturnWithQuery() throws IOException { // The search after give you the value to go to next batch assertEquals(3, collections.getBody().getSearchAfter().size(), "search_after three fields"); assertEquals( - "str:7709f541-fc0c-4318-b5b9-9053aa474e0e", + "str:19da2ce7-138f-4427-89de-a50c724f5f54", collections.getBody().getSearchAfter().get(2), "search_after 3 arg" ); @@ -278,7 +302,7 @@ public void verifyCorrectPageSizeDataReturnWithQuery() throws IOException { // The search after give you the value to go to next batch assertEquals(3, collections.getBody().getSearchAfter().size(), "search_after three fields"); assertEquals( - "str:19da2ce7-138f-4427-89de-a50c724f5f54", + "str:5c418118-2581-4936-b6fd-d6bedfe74f62", collections.getBody().getSearchAfter().get(2), "search_after 3 value" ); @@ -286,6 +310,11 @@ public void verifyCorrectPageSizeDataReturnWithQuery() throws IOException { /** * Similar to verifyCorrectPageSizeDataReturnWithQuery and add score in the query, * this is used to verify a bug fix where page_size and score crash the query + * After weighted sorting, the actual order is (for the first 4 records): + * Document 0: UUID=bf287dfe-9ce4-4969-9c59-51c39ea4d011 + * Document 1: UUID=19da2ce7-138f-4427-89de-a50c724f5f54 + * Document 2: UUID=bc55eff4-7596-3565-e044-00144fdd4fa6 + * Document 3: UUID=7709f541-fc0c-4318-b5b9-9053aa474e0e */ @Test public void verifyCorrectPageSizeAndScoreWithQuery() throws IOException { @@ -296,11 +325,17 @@ public void verifyCorrectPageSizeAndScoreWithQuery() throws IOException { // Given 6 records and we set page to 4, that means each query elastic return 4 record only // and the logic to load the reset can kick in. 
super.insertJsonToElasticRecordIndex( + // set test summaries.score 90 "5c418118-2581-4936-b6fd-d6bedfe74f62.json", + // set test summaries.score 106 "19da2ce7-138f-4427-89de-a50c724f5f54.json", + // set test summaries.score 70 "516811d7-cd1e-207a-e0440003ba8c79dd.json", + // set test summaries.score 60 "7709f541-fc0c-4318-b5b9-9053aa474e0e.json", + // set test summaries.score 50 "bc55eff4-7596-3565-e044-00144fdd4fa6.json", + // set test summaries.score 100 "bf287dfe-9ce4-4969-9c59-51c39ea4d011.json"); // Call rest api directly and get query result with search on "dataset" @@ -329,12 +364,12 @@ public void verifyCorrectPageSizeAndScoreWithQuery() throws IOException { logger.debug("verifyCorrectPageSizeAndScoreWithQuery - search after {}", collections.getBody().getSearchAfter()); assertEquals( - "80", + "100", collections.getBody().getSearchAfter().get(1), "search_after 2 value" ); assertEquals( - "str:bc55eff4-7596-3565-e044-00144fdd4fa6", + "str:bf287dfe-9ce4-4969-9c59-51c39ea4d011", collections.getBody().getSearchAfter().get(2), "search_after 3 value" ); @@ -346,7 +381,7 @@ public void verifyCorrectPageSizeAndScoreWithQuery() throws IOException { String.format("'%s|| %s || %s'", collections.getBody().getSearchAfter().get(0), collections.getBody().getSearchAfter().get(1), - "bc55eff4-7596-3565-e044-00144fdd4fa6"), + "bf287dfe-9ce4-4969-9c59-51c39ea4d011"), HttpMethod.GET, null, new ParameterizedTypeReference<>() { diff --git a/server/src/test/resources/databag/19da2ce7-138f-4427-89de-a50c724f5f54.json b/server/src/test/resources/databag/19da2ce7-138f-4427-89de-a50c724f5f54.json index b0103cbb..24ecbe39 100644 --- a/server/src/test/resources/databag/19da2ce7-138f-4427-89de-a50c724f5f54.json +++ b/server/src/test/resources/databag/19da2ce7-138f-4427-89de-a50c724f5f54.json @@ -28,7 +28,7 @@ ] }, "summaries": { - "score": 100, + "score": 106, "status": "completed", "credits": [ "Australia’s Integrated Marine Observing System (IMOS) is enabled by the National 
Collaborative Research Infrastructure Strategy (NCRIS). It is operated by a consortium of institutions as an unincorporated joint venture, with the University of Tasmania as Lead Agent.", diff --git a/server/src/test/resources/databag/5c418118-2581-4936-b6fd-d6bedfe74f62.json b/server/src/test/resources/databag/5c418118-2581-4936-b6fd-d6bedfe74f62.json index f30af378..4faf1702 100644 --- a/server/src/test/resources/databag/5c418118-2581-4936-b6fd-d6bedfe74f62.json +++ b/server/src/test/resources/databag/5c418118-2581-4936-b6fd-d6bedfe74f62.json @@ -244,7 +244,7 @@ } ], "summaries": { - "score": 100, + "score": 90, "dataset_provider": null, "dataset_group": ["aodn"], "proj:geometry": { diff --git a/server/src/test/resources/databag/7709f541-fc0c-4318-b5b9-9053aa474e0e.json b/server/src/test/resources/databag/7709f541-fc0c-4318-b5b9-9053aa474e0e.json index 70fa8279..c726bcb3 100644 --- a/server/src/test/resources/databag/7709f541-fc0c-4318-b5b9-9053aa474e0e.json +++ b/server/src/test/resources/databag/7709f541-fc0c-4318-b5b9-9053aa474e0e.json @@ -28,7 +28,7 @@ ] }, "summaries": { - "score": 95, + "score": 60, "status": "completed", "credits": [ "Australian Climate Change Science Program", diff --git a/server/src/test/resources/databag/bc55eff4-7596-3565-e044-00144fdd4fa6.json b/server/src/test/resources/databag/bc55eff4-7596-3565-e044-00144fdd4fa6.json index 3c4284c3..6529575a 100644 --- a/server/src/test/resources/databag/bc55eff4-7596-3565-e044-00144fdd4fa6.json +++ b/server/src/test/resources/databag/bc55eff4-7596-3565-e044-00144fdd4fa6.json @@ -30,7 +30,7 @@ ] }, "summaries": { - "score": 80, + "score": 50, "status": "", "scope": { "code": "nonGeographicDataset",