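Hotfix identify(): fall back to the regular metadata request when the signpost lookup fails or yields no URL; update README examples with dataclass outputs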

MichalGawor · MichalGawor · commit 88998b45ac40 · 2025-08-28T12:04:25.000+02:00
diff --git a/README.md b/README.md
@@ -43,17 +43,17 @@ By default, returns dictionary, if format=='jsons' returns a JSON string.
 ```
 
 returns:
-```JSON
-{
-  "ref_files": 
-    [
-      {"resource_type": "Resource", "filename": "", "pid": "https://wiki.korpus.cz/doku.php/en:cnk:etalon"}, 
-      {"resource_type": "Resource", "filename": "", "pid": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3698/Etalon.tgz?sequence=1"},
-      {"resource_type": "LandingPage", "filename": "", "pid": "https://hdl.handle.net/11234/1-3698"}
-    ], 
-    "description": "Etalon is a manually annotated corpus of contemporary Czech. The corpus contains 1,885,589 words (2,265,722 tokens) and is annotated in the same way as SYN2020 of the Czech National Corpus. The corpus includes fiction (ca 24%), professional and scientific literature (ca 40%) and newspapers (ca 36%). \\r\\n\\r\\nThe corpus is provided in a vertical format, where sentence boundaries are marked with a blank line. Every word form is written on a separate line, followed by five tab-separated attributes: syntactic word, lemma, sublemma, tag and verbtag. The texts are shuffled in random chunks of 100 words at maximum (respecting sentence boundaries).", 
-    "license": "http://creativecommons.org/licenses/by-nc-sa/4.0/"
-}
+```Python
+FetchResult(
+    authors=[''], 
+    description='Etalon is a manually annotated corpus of contemporary Czech. The corpus contains 1,885,589 words (2,265,722 tokens) and is annotated in the same way as SYN2020 of the Czech National Corpus. The corpus includes fiction (ca 24%), professional and scientific literature (ca 40%) and newspapers (ca 36%). \r\n\r\nThe corpus is provided in a vertical format, where sentence boundaries are marked with a blank line. Every word form is written on a separate line, followed by five tab-separated attributes: syntactic word, lemma, sublemma, tag and verbtag. The texts are shuffled in random chunks of 100 words at maximum (respecting sentence boundaries).', 
+    license='http://creativecommons.org/licenses/by-nc-sa/4.0/', 
+    ref_files=[ReferencedResources(resource_type='Metadata', ref_resources=[]), 
+               ReferencedResources(resource_type='Resource', ref_resources=[ReferencedResource(pid='https://wiki.korpus.cz/doku.php/en:cnk:etalon', data_type='text/html'), 
+                                                                            ReferencedResource(pid='https://lindat.mff.cuni.cz/repository//bitstream/handle/11234/1-3698/Etalon.tgz?sequence=1', data_type='application/x-gzip')]), 
+               ReferencedResources(resource_type='SearchPage', ref_resources=[]), 
+               ReferencedResources(resource_type='LandingPage', ref_resources=[ReferencedResource(pid='https://hdl.handle.net/11234/1-3698', data_type='')]), 
+               ReferencedResources(resource_type='SearchService', ref_resources=[])], title='https://hdl.handle.net/11234/1-3698@format=cmdi')
 
 ```
 
@@ -69,17 +69,9 @@ By default, returns dictionary, if format=='jsons' returns a JSON string.
 
 returns:
 ```Python
-FetchResult(
-    authors=[''], 
-    description='Etalon is a manually annotated corpus of contemporary Czech. The corpus contains 1,885,589 words (2,265,722 tokens) and is annotated in the same way as SYN2020 of the Czech National Corpus. The corpus includes fiction (ca 24%), professional and scientific literature (ca 40%) and newspapers (ca 36%). \r\n\r\nThe corpus is provided in a vertical format, where sentence boundaries are marked with a blank line. Every word form is written on a separate line, followed by five tab-separated attributes: syntactic word, lemma, sublemma, tag and verbtag. The texts are shuffled in random chunks of 100 words at maximum (respecting sentence boundaries).', 
-    license='http://creativecommons.org/licenses/by-nc-sa/4.0/', 
-    ref_files=[ReferencedResources(resource_type='Metadata', ref_resources=[]), 
-               ReferencedResources(resource_type='Resource', ref_resources=[ReferencedResource(pid='https://wiki.korpus.cz/doku.php/en:cnk:etalon', data_type='text/html'), 
-                                                                            ReferencedResource(pid='https://lindat.mff.cuni.cz/repository//bitstream/handle/11234/1-3698/Etalon.tgz?sequence=1', data_type='application/x-gzip')]), 
-               ReferencedResources(resource_type='SearchPage', ref_resources=[]), 
-               ReferencedResources(resource_type='LandingPage', ref_resources=[ReferencedResource(pid='https://hdl.handle.net/11234/1-3698', data_type='')]), 
-               ReferencedResources(resource_type='SearchService', ref_resources=[])], title='https://hdl.handle.net/11234/1-3698@format=cmdi')
-
+IdentifyResult(description='Etalon is a manually annotated corpus of contemporary Czech. The corpus contains 1,885,589 words (2,265,722 tokens) and is annotated in the same way as SYN2020 of the Czech National Corpus. The corpus includes fiction (ca 24%), professional and scientific literature (ca 40%) and newspapers (ca 36%). \r\n\r\nThe corpus is provided in a vertical format, where sentence boundaries are marked with a blank line. Every word form is written on a separate line, followed by five tab-separated attributes: syntactic word, lemma, sublemma, tag and verbtag. The texts are shuffled in random chunks of 100 words at maximum (respecting sentence boundaries).',
+               title='https://hdl.handle.net/11234/1-3698@format=cmdi', 
+               reverse_pid='https://hdl.handle.net/11234/1-3698@format=cmdi')
 ```
 
 #### is_host_registered(pid: str) -> bool
diff --git a/doglib/doglib.py b/doglib/doglib.py
@@ -189,18 +189,21 @@ def identify(self, pid_string: Union[str, PID]) -> dict:
                 return {}
             elif matching_repo is not None:
                 request_url: str = matching_repo.get_request_url(pid, self.secrets)
-                signpost_url = self._get_signpost_url(request_url)
-
-                if signpost_url:
-                    request_url = signpost_url
-                    final_url, response, response_headers = curl.get(request_url, follow_redirects=True)
-                    parser = matching_repo.get_parser("signpost")
-                else:
+                try:
+                    signpost_url = self._get_signpost_url(request_url)
+                    if signpost_url:
+                        request_url = signpost_url
+                        final_url, response, response_headers = curl.get(request_url, follow_redirects=True)
+                        parser = matching_repo.get_parser("signpost")
+                        return parser.identify(response)
+                    else:
+                        raise NoSignpostException("No signpost")
+                except:
                     request_headers: dict = matching_repo.get_headers(pid_factory(request_url))
                     final_url, response, response_headers = curl.get(request_url, request_headers,
                                                                      follow_redirects=True)
                     parser: Parser = matching_repo.get_parser()
-                return parser.identify(response)
+                    return parser.identify(response)
 
     def is_collection(self, pid_string: Union[str, PID]) -> bool:
         """