Skip to content

Commit 88998b4

Browse files
committed
hotfix identify, update README with dataclass outputs
1 parent 09007b6 commit 88998b4

2 files changed

Lines changed: 25 additions & 30 deletions

File tree

README.md

Lines changed: 14 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,17 @@ By default, returns a dictionary; if format=='jsons', returns a JSON string.
4343
```
4444

4545
returns:
46-
```JSON
47-
{
48-
"ref_files":
49-
[
50-
{"resource_type": "Resource", "filename": "", "pid": "https://wiki.korpus.cz/doku.php/en:cnk:etalon"},
51-
{"resource_type": "Resource", "filename": "", "pid": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3698/Etalon.tgz?sequence=1"},
52-
{"resource_type": "LandingPage", "filename": "", "pid": "https://hdl.handle.net/11234/1-3698"}
53-
],
54-
"description": "Etalon is a manually annotated corpus of contemporary Czech. The corpus contains 1,885,589 words (2,265,722 tokens) and is annotated in the same way as SYN2020 of the Czech National Corpus. The corpus includes fiction (ca 24%), professional and scientific literature (ca 40%) and newspapers (ca 36%). \\r\\n\\r\\nThe corpus is provided in a vertical format, where sentence boundaries are marked with a blank line. Every word form is written on a separate line, followed by five tab-separated attributes: syntactic word, lemma, sublemma, tag and verbtag. The texts are shuffled in random chunks of 100 words at maximum (respecting sentence boundaries).",
55-
"license": "http://creativecommons.org/licenses/by-nc-sa/4.0/"
56-
}
46+
```Python
47+
FetchResult(
48+
authors=[''],
49+
description='Etalon is a manually annotated corpus of contemporary Czech. The corpus contains 1,885,589 words (2,265,722 tokens) and is annotated in the same way as SYN2020 of the Czech National Corpus. The corpus includes fiction (ca 24%), professional and scientific literature (ca 40%) and newspapers (ca 36%). \r\n\r\nThe corpus is provided in a vertical format, where sentence boundaries are marked with a blank line. Every word form is written on a separate line, followed by five tab-separated attributes: syntactic word, lemma, sublemma, tag and verbtag. The texts are shuffled in random chunks of 100 words at maximum (respecting sentence boundaries).',
50+
license='http://creativecommons.org/licenses/by-nc-sa/4.0/',
51+
ref_files=[ReferencedResources(resource_type='Metadata', ref_resources=[]),
52+
ReferencedResources(resource_type='Resource', ref_resources=[ReferencedResource(pid='https://wiki.korpus.cz/doku.php/en:cnk:etalon', data_type='text/html'),
53+
ReferencedResource(pid='https://lindat.mff.cuni.cz/repository//bitstream/handle/11234/1-3698/Etalon.tgz?sequence=1', data_type='application/x-gzip')]),
54+
ReferencedResources(resource_type='SearchPage', ref_resources=[]),
55+
ReferencedResources(resource_type='LandingPage', ref_resources=[ReferencedResource(pid='https://hdl.handle.net/11234/1-3698', data_type='')]),
56+
ReferencedResources(resource_type='SearchService', ref_resources=[])], title='https://hdl.handle.net/11234/1-3698@format=cmdi')
5757

5858
```
5959

@@ -69,17 +69,9 @@ By default, returns a dictionary; if format=='jsons', returns a JSON string.
6969

7070
returns:
7171
```Python
72-
FetchResult(
73-
authors=[''],
74-
description='Etalon is a manually annotated corpus of contemporary Czech. The corpus contains 1,885,589 words (2,265,722 tokens) and is annotated in the same way as SYN2020 of the Czech National Corpus. The corpus includes fiction (ca 24%), professional and scientific literature (ca 40%) and newspapers (ca 36%). \r\n\r\nThe corpus is provided in a vertical format, where sentence boundaries are marked with a blank line. Every word form is written on a separate line, followed by five tab-separated attributes: syntactic word, lemma, sublemma, tag and verbtag. The texts are shuffled in random chunks of 100 words at maximum (respecting sentence boundaries).',
75-
license='http://creativecommons.org/licenses/by-nc-sa/4.0/',
76-
ref_files=[ReferencedResources(resource_type='Metadata', ref_resources=[]),
77-
ReferencedResources(resource_type='Resource', ref_resources=[ReferencedResource(pid='https://wiki.korpus.cz/doku.php/en:cnk:etalon', data_type='text/html'),
78-
ReferencedResource(pid='https://lindat.mff.cuni.cz/repository//bitstream/handle/11234/1-3698/Etalon.tgz?sequence=1', data_type='application/x-gzip')]),
79-
ReferencedResources(resource_type='SearchPage', ref_resources=[]),
80-
ReferencedResources(resource_type='LandingPage', ref_resources=[ReferencedResource(pid='https://hdl.handle.net/11234/1-3698', data_type='')]),
81-
ReferencedResources(resource_type='SearchService', ref_resources=[])], title='https://hdl.handle.net/11234/1-3698@format=cmdi')
82-
72+
IdentifyResult(description='Etalon is a manually annotated corpus of contemporary Czech. The corpus contains 1,885,589 words (2,265,722 tokens) and is annotated in the same way as SYN2020 of the Czech National Corpus. The corpus includes fiction (ca 24%), professional and scientific literature (ca 40%) and newspapers (ca 36%). \r\n\r\nThe corpus is provided in a vertical format, where sentence boundaries are marked with a blank line. Every word form is written on a separate line, followed by five tab-separated attributes: syntactic word, lemma, sublemma, tag and verbtag. The texts are shuffled in random chunks of 100 words at maximum (respecting sentence boundaries).',
73+
title='https://hdl.handle.net/11234/1-3698@format=cmdi',
74+
reverse_pid='https://hdl.handle.net/11234/1-3698@format=cmdi')
8375
```
8476

8577
#### is_host_registered(pid: str) -> bool

doglib/doglib.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -189,18 +189,21 @@ def identify(self, pid_string: Union[str, PID]) -> dict:
189189
return {}
190190
elif matching_repo is not None:
191191
request_url: str = matching_repo.get_request_url(pid, self.secrets)
192-
signpost_url = self._get_signpost_url(request_url)
193-
194-
if signpost_url:
195-
request_url = signpost_url
196-
final_url, response, response_headers = curl.get(request_url, follow_redirects=True)
197-
parser = matching_repo.get_parser("signpost")
198-
else:
192+
try:
193+
signpost_url = self._get_signpost_url(request_url)
194+
if signpost_url:
195+
request_url = signpost_url
196+
final_url, response, response_headers = curl.get(request_url, follow_redirects=True)
197+
parser = matching_repo.get_parser("signpost")
198+
return parser.identify(response)
199+
else:
200+
raise NoSignpostException("No signpost")
201+
except:
199202
request_headers: dict = matching_repo.get_headers(pid_factory(request_url))
200203
final_url, response, response_headers = curl.get(request_url, request_headers,
201204
follow_redirects=True)
202205
parser: Parser = matching_repo.get_parser()
203-
return parser.identify(response)
206+
return parser.identify(response)
204207

205208
def is_collection(self, pid_string: Union[str, PID]) -> bool:
206209
"""

0 commit comments

Comments
 (0)