Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
795610a
typo
lpi-tn Jan 29, 2026
9c5f6e4
feat(fao-open-knowledge): implement FAO Open Knowledge data models an…
lpi-tn Jan 29, 2026
161a049
feat(fao-open-knowledge): refactor data models and implement FAO Open…
lpi-tn Jan 30, 2026
d96d455
feat(fao-open-knowledge): add MetadataEntry model and enhance embargo…
lpi-tn Jan 30, 2026
c5e03e7
feat(fao-open-knowledge): implement detail extraction for FAO documen…
lpi-tn Jan 30, 2026
8caade9
feat(fao-open-knowledge): enhance error handling and add unit tests f…
lpi-tn Jan 30, 2026
85acbe9
feat(fao-open-knowledge): improve metadata parsing and update test ca…
lpi-tn Feb 2, 2026
ef2fdbc
feat(fao-open-knowledge): add unit tests for FAO Open Knowledge model…
lpi-tn Feb 2, 2026
772ea9e
Merge branch 'main' into Feature/FAO-open-knowledge
lpi-tn Feb 3, 2026
0a6d5a5
feat(fao_open_knowledge): add Bitstream and Checksum models, enhance …
lpi-tn Feb 3, 2026
a89e554
feat(fao_open_knowledge): implement serialization for dataclass insta…
lpi-tn Feb 3, 2026
3e0898a
feat(fao_open_knowledge): enhance test coverage for document processi…
lpi-tn Feb 3, 2026
4f275bd
Update welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py
lpi-tn Feb 3, 2026
b55ca25
Update welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py
lpi-tn Feb 3, 2026
b83c4c7
feat(fao_open_knowledge): add workflow for FAO open knowledge data co…
lpi-tn Feb 5, 2026
a7a9af1
Merge remote-tracking branch 'origin/Feature/FAO-open-knowledge' into…
lpi-tn Feb 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -498,4 +498,42 @@ spec:
volumeAttributes:
secretName: {{ $.Values.common.azureShare.secret.name }}
shareName: {{ $.Values.common.azureShare.name }}


- name: collect-fao-open-knowledge
synchronization:
semaphores:
- configMapKeyRef:
name: {{ .collectorSemaphore.configmapName }}
key: {{ .collectorSemaphore.standard.keyName }}
container:
{{- with $.Values.image }}
image: {{ include "common.pods.image" (dict "root" $ "image" (dict "repository" .repository "path" .path "tag" .tag))}}
{{- end }}
args:
- python
- "-m"
- welearn_datastack.nodes_workflow.URLCollectors.node_fao_open_knowledge_collect
envFrom:
- configMapRef:
name: {{ .name }}
volumeMounts:
- name: secrets
mountPath: "/secrets"
readOnly: true
- name: azure-share
mountPath: {{ $.Values.common.azureShare.mountPath }}
volumes:
- name: secrets
secret:
secretName: {{ .name }}
- name: azure-share
csi:
driver: file.csi.azure.com
readOnly: true
volumeAttributes:
secretName: {{ $.Values.common.azureShare.secret.name }}
shareName: {{ $.Values.common.azureShare.name }}


{{- end }}
312 changes: 312 additions & 0 deletions tests/document_collector_hub/plugins_test/test_fao_open_knowledge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,312 @@
import unittest
import uuid
from unittest.mock import patch

import requests
from welearn_database.data.models.document_related import WeLearnDocument

from welearn_datastack.data.source_models.fao_open_knowledge import (
BitstreamModel,
BitstreamsLinksModel,
Bundle,
BundleLinksModel,
ChecksumModel,
Item,
Link,
MetadataEntry,
_Links,
)
from welearn_datastack.plugins.rest_requesters.fao_open_knowledge import (
FAOOpenKnowledgeCollector,
)


class TestFAOOpenKnowledgeCollector(unittest.TestCase):
def setUp(self):
self.collector = FAOOpenKnowledgeCollector()
self.doc = WeLearnDocument(
id=1,
url="https://example.org/fao/resource/1234",
external_id="abcd-1234",
details={},
)
self.item = Item(
id="abcd-1234",
uuid="abcd-1234",
name="FAO Document Title",
handle="1234/5678",
metadata={
"dc.rights.license": [
{
"value": "CC-BY-4.0",
"language": "en",
"authority": "FAO",
"confidence": 1,
"place": 0,
}
],
"dc.contributor.author": [
{
"value": "John Doe;Jane Smith",
"language": "en",
"authority": "FAO",
"confidence": 1,
"place": 0,
}
],
"dc.description.abstract": [
{
"value": "A description.",
"language": "en",
"authority": "FAO",
"confidence": 1,
"place": 0,
}
],
"dc.identifier.doi": [
{
"value": "10.1234/fao.5678",
"language": "en",
"authority": "FAO",
"confidence": 1,
"place": 0,
}
],
"dc.date.available": [
{
"value": "2023-01-01T00:00:00Z",
"language": "en",
"authority": "FAO",
"confidence": 1,
"place": 0,
}
],
"dc.date.lastModified": [
{
"value": "2023-01-02T00:00:00Z",
"language": "en",
"authority": "FAO",
"confidence": 1,
"place": 0,
}
],
"fao.taxonomy.type": [
{
"value": "Report",
"language": "en",
"authority": "FAO",
"confidence": 1,
"place": 0,
}
],
},
inArchive=True,
discoverable=True,
withdrawn=False,
lastModified="2023-01-02T00:00:00Z",
entityType=None,
type="item",
_links=_Links(
item=Link(href=""),
bitstreams=Link(href=""),
primaryBitstream=Link(href=""),
self=Link(href=""),
bundles=Link(href=""),
mappedCollections=Link(href=""),
owningCollection=Link(href=""),
relationships=Link(href=""),
version=Link(href=""),
templateItemOf=Link(href=""),
thumbnail=Link(href=""),
relateditemlistconfigs=Link(href=""),
),
)
self.bundle = Bundle(
uuid="pdf-uuid",
name="ORIGINAL",
handle=None,
metadata={},
type="bundle",
_links=BundleLinksModel(
item=Link(href=""),
bitstreams=Link(href=""),
primaryBitstream=Link(href=""),
self=Link(href=""),
),
)

self.bitstream = BitstreamModel(
id=str(uuid.uuid4()),
uuid=str(uuid.uuid4()),
name="document.pdf",
handle=None,
metadata={},
bundleName="ORIGINAL",
sizeBytes=1024,
checkSum=ChecksumModel(value="checksum-value", checkSumAlgorithm="MD5"),
sequenceId=1,
type="bitstream",
_links=BitstreamsLinksModel(
content=Link(href=""),
bundle=Link(href=""),
format=Link(href=""),
thumbnail=Link(href=""),
self=Link(href=""),
),
)

@patch.object(FAOOpenKnowledgeCollector, "get_bitstream_json")
@patch.object(FAOOpenKnowledgeCollector, "_get_pdf_content")
@patch.object(FAOOpenKnowledgeCollector, "get_bundle_json")
@patch.object(FAOOpenKnowledgeCollector, "get_metadata_json")
def test_run_embargo(
self, mock_get_metadata, mock_get_bundle, mock_get_pdf, mock_get_bitstream
):
# Simulate an embargoed document (inArchive=False)
embargoed_item = self.item.model_copy()
embargoed_item.metadata["fao.embargo"] = MetadataEntry(
value="Yes", language="en", authority="FAO", confidence=1, place=0
)
mock_get_metadata.return_value = embargoed_item
mock_get_bundle.return_value = [self.bundle]
mock_get_pdf.return_value = "PDF content extracted. Lorem Ipsum."
mock_get_bitstream.return_value = self.bitstream
result = self.collector.run([self.doc])
self.assertEqual(len(result), 1)
doc_result = result[0]
self.assertIsNotNone(doc_result.error_info)
self.assertIn("embargo", doc_result.error_info)

@patch.object(FAOOpenKnowledgeCollector, "get_bitstream_json")
@patch.object(FAOOpenKnowledgeCollector, "_get_pdf_content")
@patch.object(FAOOpenKnowledgeCollector, "get_bundle_json")
@patch.object(FAOOpenKnowledgeCollector, "get_metadata_json")
def test_run_http_error(
self, mock_get_metadata, mock_get_bundle, mock_get_pdf, mock_get_bitstream
):
error = requests.HTTPError("HTTP error")
error.response = requests.Response()
error.response.status_code = 404
mock_get_metadata.side_effect = error
mock_get_bundle.return_value = [self.bundle]
mock_get_pdf.return_value = "PDF content extracted. Lorem Ipsum."
mock_get_bitstream.return_value = self.bitstream
result = self.collector.run([self.doc])
self.assertEqual(len(result), 1)
doc_result = result[0]
self.assertIsNotNone(doc_result.error_info)
self.assertIn("HTTP error", doc_result.error_info)

@patch.object(FAOOpenKnowledgeCollector, "get_bitstream_json")
@patch.object(FAOOpenKnowledgeCollector, "_get_pdf_content")
@patch.object(FAOOpenKnowledgeCollector, "get_bundle_json")
@patch.object(FAOOpenKnowledgeCollector, "get_metadata_json")
def test_run_multiple_documents(
self, mock_get_metadata, mock_get_bundle, mock_get_pdf, mock_get_bitstream
):
mock_get_metadata.return_value = self.item
mock_get_bundle.return_value = [self.bundle]
mock_get_pdf.return_value = "PDF content extracted. Lorem Ipsum."
mock_get_bitstream.return_value = self.bitstream
doc2 = WeLearnDocument(
id=2,
url="https://example.org/fao/resource/5678",
external_id="efgh-5678",
details={},
)
result = self.collector.run([self.doc, doc2])
self.assertEqual(len(result), 2)
for doc_result in result:
self.assertIsNone(doc_result.error_info)

@patch.object(FAOOpenKnowledgeCollector, "get_bitstream_json")
@patch.object(FAOOpenKnowledgeCollector, "_get_pdf_content")
@patch.object(FAOOpenKnowledgeCollector, "get_bundle_json")
@patch.object(FAOOpenKnowledgeCollector, "get_metadata_json")
def test_run_no_pdf(
self, mock_get_metadata, mock_get_bundle, mock_get_pdf, mock_get_bitstream
):
mock_get_metadata.return_value = self.item
mock_get_bundle.return_value = []
mock_get_pdf.return_value = ""
mock_get_bitstream.return_value = self.bitstream
result = self.collector.run([self.doc])
self.assertEqual(len(result), 1)
doc_result = result[0]
self.assertIsNotNone(doc_result.error_info)
self.assertIn("no content", doc_result.error_info)

@patch.object(FAOOpenKnowledgeCollector, "get_bitstream_json")
@patch.object(FAOOpenKnowledgeCollector, "_get_pdf_content")
@patch.object(FAOOpenKnowledgeCollector, "get_bundle_json")
@patch.object(FAOOpenKnowledgeCollector, "get_metadata_json")
def test_run_pdf_content_empty(
self, mock_get_metadata, mock_get_bundle, mock_get_pdf, mock_get_bitstream
):
mock_get_metadata.return_value = self.item
mock_get_bundle.return_value = [self.bundle]
mock_get_pdf.return_value = ""
mock_get_bitstream.return_value = self.bitstream
result = self.collector.run([self.doc])
self.assertEqual(len(result), 1)
doc_result = result[0]
self.assertIsNotNone(doc_result.error_info)

@patch.object(FAOOpenKnowledgeCollector, "get_bitstream_json")
@patch.object(FAOOpenKnowledgeCollector, "_get_pdf_content")
@patch.object(FAOOpenKnowledgeCollector, "get_bundle_json")
@patch.object(FAOOpenKnowledgeCollector, "get_metadata_json")
def test_run_success(
self, mock_get_metadata, mock_get_bundle, mock_get_pdf, mock_get_bitstream
):
mock_get_metadata.return_value = self.item
mock_get_bundle.return_value = [self.bundle]
mock_get_pdf.return_value = "PDF content extracted. Lorem Ipsum."
mock_get_bitstream.return_value = self.bitstream

result = self.collector.run([self.doc])
self.assertEqual(len(result), 1)
doc_result = result[0]
self.assertIsNone(doc_result.error_info)
self.assertIn("full_content", doc_result.document.__dict__)
self.assertEqual(
doc_result.document.full_content, "PDF content extracted. Lorem Ipsum."
)

@patch.object(FAOOpenKnowledgeCollector, "get_bitstream_json")
@patch.object(FAOOpenKnowledgeCollector, "_get_pdf_content")
@patch.object(FAOOpenKnowledgeCollector, "get_bundle_json")
@patch.object(FAOOpenKnowledgeCollector, "get_metadata_json")
def test_run_unauthorized_license(
self, mock_get_metadata, mock_get_bundle, mock_get_pdf, mock_get_bitstream
):
unauthorized_item = self.item.model_copy(
update={"metadata": {"dc.rights.license": [{"value": "NO-LICENSE"}]}}
)
mock_get_metadata.return_value = unauthorized_item
mock_get_bundle.return_value = [self.bundle]
mock_get_pdf.return_value = "PDF content extracted. Lorem Ipsum."
mock_get_bitstream.return_value = self.bitstream
result = self.collector.run([self.doc])
self.assertEqual(len(result), 1)
doc_result = result[0]
self.assertIsNotNone(doc_result.error_info)
self.assertIn("unauthorized", doc_result.error_info)

@patch.object(FAOOpenKnowledgeCollector, "get_bitstream_json")
@patch.object(FAOOpenKnowledgeCollector, "_get_pdf_content")
@patch.object(FAOOpenKnowledgeCollector, "get_bundle_json")
@patch.object(FAOOpenKnowledgeCollector, "get_metadata_json")
def test_run_withdrawn(
self, mock_get_metadata, mock_get_bundle, mock_get_pdf, mock_get_bitstream
):
withdrawn_item = self.item.model_copy(update={"withdrawn": True})
mock_get_metadata.return_value = withdrawn_item
mock_get_bundle.return_value = [self.bundle]
mock_get_pdf.return_value = "PDF content extracted. Lorem Ipsum."
mock_get_bitstream.return_value = self.bitstream
result = self.collector.run([self.doc])
self.assertEqual(len(result), 1)
doc_result = result[0]
self.assertIsNotNone(doc_result.error_info)
self.assertIn("withdrawn", doc_result.error_info)
Empty file added tests/source_models/__init__.py
Empty file.
Loading