diff --git a/HISTORY.rst b/HISTORY.rst
index 0f821faf..4585966c 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -109,4 +109,8 @@ History
 2.18.0
 ------------------
 * Add atmospherics and speaker_count support
-* Deprecated support for Python versions up to 3.8
\ No newline at end of file
+* Deprecated support for Python versions up to 3.8
+
+2.19.0
+------------------
+* Add async translation and summarization
diff --git a/README.md b/README.md
index 3d8f71ef..5bb2f374 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,30 @@ and `custom_vocabulary_id` as optional parameters.
 The url submission option also supports authentication headers by using the `source_config` option.
 
+You can request a transcript summary.
+
+```python
+# submit a job with transcript summarization
+job = client.submit_job_url("https://example.com/file-to-transcribe.mp3",
+                            language='en',
+                            summarization_config=SummarizationOptions(
+                                formatting_type=SummarizationFormattingOptions.BULLETS
+                            ))
+```
+
+You can request transcript translation into up to five languages.
+
+```python
+job = client.submit_job_url("https://example.com/file-to-transcribe.mp3",
+                            language='en',
+                            translation_config=TranslationOptions(
+                                target_languages=[
+                                    TranslationLanguageOptions("es", NlpModel.PREMIUM),
+                                    TranslationLanguageOptions("de")
+                                ]
+                            ))
+```
+
 All options are described in the request body of the
 [Submit Job](https://docs.rev.ai/api/asynchronous/reference/#operation/SubmitTranscriptionJob) endpoint.
 
@@ -131,6 +155,9 @@ transcript_json = client.get_transcript_json(job.id)
 
 # or as a python object
 transcript_object = client.get_transcript_object(job.id)
+
+# or if you requested transcript translation(s)
+transcript_object = client.get_translated_transcript_object(job.id, 'es')
 ```
 
 Both the json and object forms contain all the formation outlined in the response
@@ -138,6 +165,21 @@ of the [Get Transcript](https://docs.rev.ai/api/asynchronous/reference/#operatio
 when using the json response schema. While the text output is a string containing
 just the text of your transcript
 
+### Getting transcript summary
+
+If you requested a transcript summary, you can retrieve it as plain text or as a structured object:
+
+```python
+# as text
+summary = client.get_transcript_summary_text(job.id)
+
+# as json
+summary = client.get_transcript_summary_json(job.id)
+
+# or as a python object
+summary = client.get_transcript_summary_object(job.id)
+
+```
 ### Getting captions output
 
 You can also get captions output from the SDK. We offer both SRT and VTT caption formats.
@@ -145,6 +187,10 @@ If you submitted your job as speaker channel audio then you must also provide a
 ```python
 captions = client.get_captions(job.id, content_type=CaptionType.SRT, channel_id=None)
+
+# or if you requested transcript translation(s)
+captions = client.get_translated_captions(job.id, 'es')
+
 ```
 
 ### Streamed outputs
diff --git a/setup.cfg b/setup.cfg
index f84ead4c..3cdec9af 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.18.0
+current_version = 2.19.0
 commit = True
 tag = True
diff --git a/src/rev_ai/__init__.py b/src/rev_ai/__init__.py
index 6ee0399b..7da9b328 100644
--- a/src/rev_ai/__init__.py
+++ b/src/rev_ai/__init__.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """Top-level package for rev_ai"""
 
-__version__ = '2.18.0'
+__version__ = '2.19.0'
 
 from .models import Job, JobStatus, Account, Transcript, Monologue, Element, MediaConfig, \
     CaptionType, CustomVocabulary, TopicExtractionJob, TopicExtractionResult, Topic, Informant, \
diff --git a/src/rev_ai/apiclient.py b/src/rev_ai/apiclient.py
index ea94e809..0cc249d9 100644
--- a/src/rev_ai/apiclient.py
+++ b/src/rev_ai/apiclient.py
@@ -2,9 +2,13 @@
 """Speech recognition tools for using Rev AI"""
 
 import json
-from .models import Account, CaptionType, Job, Transcript
-from .baseclient import BaseClient
+
 from . import utils
+from .baseclient import BaseClient
+from .models import Account, CaptionType, Job, Transcript
+from .models.asynchronous.summarization_options import SummarizationOptions
+from .models.asynchronous.summary import Summary
+from .models.asynchronous.translation_options import TranslationOptions
 
 try:
     from urllib.parse import urljoin
@@ -66,7 +70,9 @@ def submit_job_url(
             notification_config=None,
             skip_postprocessing=False,
             remove_atmospherics=False,
-            speakers_count=None):
+            speakers_count=None,
+            summarization_config: SummarizationOptions = None,
+            translation_config: TranslationOptions = None):
         """Submit media given a URL for transcription. The audio data is downloaded from the URL
 
         :param media_url: web location of the media file
@@ -116,6 +122,8 @@
         :param remove_atmospherics: Atmospherics such as , , etc. will not appear in the transcript.
         :param speakers_count: Use to specify the total number of unique speakers in the audio.
+        :param summarization_config: Use to request transcript summary.
+        :param translation_config: Use to request transcript translation.
         :returns: raw response data
         :raises: HTTPError
         """
@@ -128,7 +136,9 @@ def submit_job_url(
             verbatim, rush, test_mode,
             segments_to_transcribe, speaker_names,
             source_config, notification_config,
-            skip_postprocessing)
+            skip_postprocessing,
+            summarization_config=summarization_config,
+            translation_config=translation_config)
 
         response = self._make_http_request(
             "POST",
@@ -161,7 +171,9 @@ def submit_job_local_file(
             notification_config=None,
             skip_postprocessing=False,
             remove_atmospherics=False,
-            speakers_count=None):
+            speakers_count=None,
+            summarization_config: SummarizationOptions = None,
+            translation_config: TranslationOptions = None):
         """Submit a local file for transcription.
 
         Note that the content type is inferred if not provided.
@@ -208,6 +220,8 @@
         :param remove_atmospherics: Atmospherics such as , , etc. will not appear in the transcript.
         :param speakers_count: Use to specify the total number of unique speakers in the audio.
+        :param summarization_config: Use to request transcript summary.
+        :param translation_config: Use to request transcript translation.
         :returns: raw response data
         :raises: HTTPError, ValueError
         """
@@ -222,7 +236,9 @@ def submit_job_local_file(
             language, custom_vocabulary_id, transcriber,
             verbatim, rush, test_mode,
             segments_to_transcribe, speaker_names, None,
-            notification_config, skip_postprocessing)
+            notification_config, skip_postprocessing,
+            summarization_config=summarization_config,
+            translation_config=translation_config)
 
         with open(filename, 'rb') as f:
             files = {
@@ -397,6 +413,28 @@ def get_captions(self, id_, content_type=CaptionType.SRT, channel_id=None):
 
         return response.text
 
+    def get_translated_captions(self, id_, language, content_type=CaptionType.SRT, channel_id=None):
+        """Get the translated captions output of a specific job and return it as plain text
+
+        :param id_: id of job to be requested
+        :param language: requested translation language
+        :param content_type: caption type which should be returned. Defaults to SRT
+        :param channel_id: id of speaker channel to be captioned, only matters for multichannel jobs
+        :returns: caption data as text
+        :raises: HTTPError
+        """
+        if not id_:
+            raise ValueError('id_ must be provided')
+        query = self._create_captions_query(channel_id)
+
+        response = self._make_http_request(
+            "GET",
+            urljoin(self.base_url,
+                    'jobs/{0}/captions/translation/{1}{2}'.format(id_, language, query)),
+            headers={'Accept': content_type.value}
+        )
+
+        return response.text
+
     def get_captions_as_stream(self, id_, content_type=CaptionType.SRT, channel_id=None):
         """Get the captions output of a specific job and return it as a plain text stream
 
@@ -420,6 +458,36 @@ def get_captions_as_stream(self, id_, content_type=CaptionType.SRT, channel_id=N
 
         return response
 
+    def get_translated_captions_as_stream(
+            self,
+            id_,
+            language,
+            content_type=CaptionType.SRT,
+            channel_id=None):
+        """Get the translated captions output of a specific job and return it as a plain text stream
+
+        :param id_: id of job to be requested
+        :param language: requested translation language
+        :param content_type: caption type which should be returned. Defaults to SRT
+        :param channel_id: id of speaker channel to be captioned, only matters for multichannel jobs
+        :returns: requests.models.Response HTTP response which can be used to stream
+            the payload of the response
+        :raises: HTTPError
+        """
+        if not id_:
+            raise ValueError('id_ must be provided')
+        query = self._create_captions_query(channel_id)
+
+        response = self._make_http_request(
+            "GET",
+            urljoin(self.base_url,
+                    'jobs/{0}/captions/translation/{1}{2}'.format(id_, language, query)),
+            headers={'Accept': content_type.value},
+            stream=True
+        )
+
+        return response
+
     def delete_job(self, id_):
         """Delete a specific transcription job
         All data related to the job, such as input media and transcript, will be permanently
@@ -451,6 +519,160 @@ def get_account(self):
 
         return Account.from_json(response.json())
 
+    def get_transcript_summary_text(self, id_):
+        """Get the transcript summary of a specific job as plain text.
+
+        :param id_: id of job to be requested
+        :returns: summary data as text
+        :raises: HTTPError
+        """
+        if not id_:
+            raise ValueError('id_ must be provided')
+
+        response = self._make_http_request(
+            "GET",
+            urljoin(self.base_url, 'jobs/{}/transcript/summary'.format(id_)),
+            headers={'Accept': 'text/plain'}
+        )
+        return response.text
+
+    def get_transcript_summary_json(self, id_):
+        """Get the transcript summary of a specific job as json.
+
+        :param id_: id of job to be requested
+        :returns: summary data as json
+        :raises: HTTPError
+        """
+        if not id_:
+            raise ValueError('id_ must be provided')
+
+        response = self._make_http_request(
+            "GET",
+            urljoin(self.base_url, 'jobs/{}/transcript/summary'.format(id_)),
+            headers={'Accept': 'application/json'}
+        )
+
+        return Summary.from_json(response.json())
+
+    def get_transcript_summary_json_as_stream(self, id_):
+        """Get the transcript summary of a specific job as streamed json.
+
+        :param id_: id of job to be requested
+        :returns: requests.models.Response HTTP response which can be used to stream
+            the payload of the response
+        :raises: HTTPError
+        """
+        if not id_:
+            raise ValueError('id_ must be provided')
+
+        response = self._make_http_request(
+            "GET",
+            urljoin(self.base_url, 'jobs/{}/transcript/summary'.format(id_)),
+            headers={'Accept': 'application/json'},
+            stream=True
+        )
+
+        return response
+
+    def get_translated_transcript_text(self, id_, language):
+        """Get the translated transcript of a specific job as plain text.
+
+        :param id_: id of job to be requested
+        :param language: requested language
+        :returns: transcript data as text
+        :raises: HTTPError
+        """
+        if not id_:
+            raise ValueError('id_ must be provided')
+
+        response = self._make_http_request(
+            "GET",
+            urljoin(self.base_url, 'jobs/{}/transcript/translation/{}'.format(id_, language)),
+            headers={'Accept': 'text/plain'}
+        )
+
+        return response.text
+
+    def get_translated_transcript_text_as_stream(self, id_, language):
+        """Get the translated transcript of a specific job as a plain text stream.
+
+        :param id_: id of job to be requested
+        :param language: requested language
+        :returns: requests.models.Response HTTP response which can be used to stream
+            the payload of the response
+        :raises: HTTPError
+        """
+        if not id_:
+            raise ValueError('id_ must be provided')
+
+        response = self._make_http_request(
+            "GET",
+            urljoin(self.base_url, 'jobs/{}/transcript/translation/{}'.format(id_, language)),
+            headers={'Accept': 'text/plain'},
+            stream=True
+        )
+
+        return response
+
+    def get_translated_transcript_json(self, id_, language):
+        """Get the translated transcript of a specific job as json.
+
+        :param id_: id of job to be requested
+        :param language: requested language
+        :returns: transcript data as json
+        :raises: HTTPError
+        """
+        if not id_:
+            raise ValueError('id_ must be provided')
+
+        response = self._make_http_request(
+            "GET",
+            urljoin(self.base_url, 'jobs/{}/transcript/translation/{}'.format(id_, language)),
+            headers={'Accept': self.rev_json_content_type}
+        )
+
+        return response.json()
+
+    def get_translated_transcript_json_as_stream(self, id_, language):
+        """Get the translated transcript of a specific job as streamed json.
+
+        :param id_: id of job to be requested
+        :param language: requested language
+        :returns: requests.models.Response HTTP response which can be used to stream
+            the payload of the response
+        :raises: HTTPError
+        """
+        if not id_:
+            raise ValueError('id_ must be provided')
+
+        response = self._make_http_request(
+            "GET",
+            urljoin(self.base_url, 'jobs/{}/transcript/translation/{}'.format(id_, language)),
+            headers={'Accept': self.rev_json_content_type},
+            stream=True
+        )
+
+        return response
+
+    def get_translated_transcript_object(self, id_, language):
+        """Get the translated transcript of a specific job as a python object.
+
+        :param id_: id of job to be requested
+        :param language: requested language
+        :returns: transcript data as a python object
+        :raises: HTTPError
+        """
+        if not id_:
+            raise ValueError('id_ must be provided')
+
+        response = self._make_http_request(
+            "GET",
+            urljoin(self.base_url, 'jobs/{}/transcript/translation/{}'.format(id_, language)),
+            headers={'Accept': self.rev_json_content_type}
+        )
+
+        return Transcript.from_json(response.json())
+
     def _create_job_options_payload(
             self,
             media_url=None,
@@ -475,7 +697,9 @@ def _create_job_options_payload(
             notification_config=None,
             skip_postprocessing=False,
             remove_atmospherics=None,
-            speakers_count=None):
+            speakers_count=None,
+            summarization_config: SummarizationOptions = None,
+            translation_config: TranslationOptions = None):
         payload = {}
         if media_url:
             payload['media_url'] = media_url
@@ -512,7 +736,7 @@ def _create_job_options_payload(
         if segments_to_transcribe:
             payload['segments_to_transcribe'] = segments_to_transcribe
         if speaker_names:
-            payload['speaker_names'] =\
+            payload['speaker_names'] = \
                 utils._process_speaker_names(speaker_names)
         if source_config:
             payload['source_config'] = source_config.to_dict()
@@ -524,6 +748,10 @@ def _create_job_options_payload(
             payload['remove_atmospherics'] = remove_atmospherics
         if speakers_count:
             payload['speakers_count'] = speakers_count
+        if summarization_config:
+            payload['summarization_config'] = summarization_config.to_dict()
+        if translation_config:
+            payload['translation_config'] = translation_config.to_dict()
         return payload
 
     def _create_captions_query(self, speaker_channel):
diff --git a/src/rev_ai/models/asynchronous/job.py b/src/rev_ai/models/asynchronous/job.py
index 609833b1..e4c523ed 100644
--- a/src/rev_ai/models/asynchronous/job.py
+++ b/src/rev_ai/models/asynchronous/job.py
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 """Job model"""
-
+from .summarization_options import Summarization
 from .job_status import JobStatus
+from .translation_options import Translation
 
 
 class Job:
@@ -28,7 +29,9 @@ def __init__(
             rush=None,
             segments_to_transcribe=None,
             remove_atmospherics=None,
-            speakers_count=None):
+            speakers_count=None,
+            summarization: Summarization = None,
+            translation: Translation = None):
         """
         :param id_: unique id of job
         :param created_on: date and time at which this job was started
@@ -85,6 +88,8 @@ def __init__(
         self.segments_to_transcribe = segments_to_transcribe
         self.remove_atmospherics = remove_atmospherics
         self.speakers_count = speakers_count
+        self.summarization = summarization
+        self.translation = translation
 
     def __eq__(self, other):
         """Override default equality operator"""
@@ -120,5 +125,7 @@ def from_json(cls, json):
             rush=json.get('rush'),
             segments_to_transcribe=json.get('segments_to_transcribe'),
             remove_atmospherics=json.get('remove_atmospherics'),
-            speakers_count=json.get('speakers_count')
+            speakers_count=json.get('speakers_count'),
+            summarization=Summarization.from_json(json.get('summarization')),
+            translation=Translation.from_json(json.get('translation'))
         )
diff --git a/src/rev_ai/models/asynchronous/summarization_formatting_options.py b/src/rev_ai/models/asynchronous/summarization_formatting_options.py
new file mode 100644
index 00000000..b48aa251
--- /dev/null
+++ b/src/rev_ai/models/asynchronous/summarization_formatting_options.py
@@ -0,0 +1,7 @@
+from enum import Enum
+
+
+class SummarizationFormattingOptions(str, Enum):
+    """Summarization formatting options."""
+    PARAGRAPH = "paragraph"
+    BULLETS = "bullets"
diff --git a/src/rev_ai/models/asynchronous/summarization_job_status.py b/src/rev_ai/models/asynchronous/summarization_job_status.py
new file mode 100644
index 00000000..7f25a860
--- /dev/null
+++ b/src/rev_ai/models/asynchronous/summarization_job_status.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from enum import Enum
+
+
+class SummarizationJobStatus(str, Enum):
+    """Enum for Summarization Job statuses"""
+    IN_PROGRESS = "in_progress"
+    FAILED = "failed"
+    COMPLETED = "completed"
diff --git a/src/rev_ai/models/asynchronous/summarization_options.py b/src/rev_ai/models/asynchronous/summarization_options.py
new file mode 100644
index 00000000..76f078c8
--- /dev/null
+++ b/src/rev_ai/models/asynchronous/summarization_options.py
@@ -0,0 +1,60 @@
+from .summarization_formatting_options import SummarizationFormattingOptions
+from .summarization_job_status import SummarizationJobStatus
+from ..nlp_model import NlpModel
+
+
+class SummarizationOptions:
+    """Summarization request options."""
+
+    def __init__(
+            self,
+            prompt: str = None,
+            model: NlpModel = None,
+            formatting_type: SummarizationFormattingOptions = None):
+        self.prompt = prompt
+        self.model = model
+        self.type = formatting_type
+
+    def to_dict(self):
+        """Returns the raw form of the summarization options as the api
+        expects them"""
+        dict_result = {}
+        if self.prompt:
+            dict_result['prompt'] = self.prompt
+        if self.model:
+            dict_result['model'] = self.model
+        if self.type:
+            dict_result['type'] = self.type
+
+        return dict_result
+
+
+class Summarization(SummarizationOptions):
+    """Summarization options."""
+
+    def __init__(
+            self,
+            prompt: str = None,
+            model: NlpModel = None,
+            formatting_type: SummarizationFormattingOptions = None,
+            status: SummarizationJobStatus = None,
+            completed_on: str = None,
+            failure: str = None):
+        super().__init__(prompt, model, formatting_type)
+        self.status = status
+        self.completed_on = completed_on
+        self.failure = failure
+
+    @classmethod
+    def from_json(cls, json):
+        if json is None:
+            return None
+
+        return cls(
+            json.get('prompt'),
+            json.get('model'),
+            json.get('type'),
+            json.get('status'),
+            json.get('completed_on'),
+            json.get('failure')
+        )
diff --git a/src/rev_ai/models/asynchronous/summary.py b/src/rev_ai/models/asynchronous/summary.py
new file mode 100644
index 00000000..de2cacf5
--- /dev/null
+++ b/src/rev_ai/models/asynchronous/summary.py
@@ -0,0 +1,18 @@
+from typing import List
+
+
+class Summary:
+    """Transcript summary model."""
+    def __init__(
+            self,
+            summary: str,
+            bullet_points: List[str]):
+        self.summary = summary
+        self.bullet_points = bullet_points
+
+    @classmethod
+    def from_json(cls, json):
+        return cls(
+            json.get('summary'),
+            json.get('bullet_points')
+        )
diff --git a/src/rev_ai/models/asynchronous/translation_job_status.py b/src/rev_ai/models/asynchronous/translation_job_status.py
new file mode 100644
index 00000000..490ef2b2
--- /dev/null
+++ b/src/rev_ai/models/asynchronous/translation_job_status.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from enum import Enum
+
+
+class TranslationJobStatus(str, Enum):
+    """Enum for Translation Job statuses"""
+    IN_PROGRESS = "in_progress"
+    FAILED = "failed"
+    COMPLETED = "completed"
diff --git a/src/rev_ai/models/asynchronous/translation_language_options.py b/src/rev_ai/models/asynchronous/translation_language_options.py
new file mode 100644
index 00000000..eb36b17a
--- /dev/null
+++ b/src/rev_ai/models/asynchronous/translation_language_options.py
@@ -0,0 +1,47 @@
+from .translation_job_status import TranslationJobStatus
+from ..nlp_model import NlpModel
+
+
+class TranslationLanguageOptions:
+    """Translation language request options."""
+    def __init__(
+            self,
+            language: str = None,
+            model: NlpModel = None):
+        self.language = language
+        self.model = model
+
+    def to_dict(self):
+        """Returns the raw form of the translation language options as the api
+        expects them"""
+        dict_result = {}
+        if self.language:
+            dict_result['language'] = self.language
+        if self.model:
+            dict_result['model'] = self.model
+
+        return dict_result
+
+
+class TranslationLanguage(TranslationLanguageOptions):
+    """Translation language options."""
+    def __init__(
+            self,
+            language: str = None,
+            model: NlpModel = None,
+            status: TranslationJobStatus = None,
+            failure: str = None):
+        super().__init__(language, model)
+        self.status = status
+        self.failure = failure
+
+    @classmethod
+    def from_json(cls, json):
+        if json is None:
+            return None
+        return cls(
+            json.get('language'),
+            json.get('model'),
+            json.get('status'),
+            json.get('failure')
+        )
diff --git a/src/rev_ai/models/asynchronous/translation_options.py b/src/rev_ai/models/asynchronous/translation_options.py
new file mode 100644
index 00000000..deb21e67
--- /dev/null
+++ b/src/rev_ai/models/asynchronous/translation_options.py
@@ -0,0 +1,36 @@
+from typing import List
+from .translation_language_options import TranslationLanguageOptions, TranslationLanguage
+
+
+class TranslationOptions:
+    """Translation request options."""
+    def __init__(
+            self,
+            target_languages: List[TranslationLanguageOptions]):
+        self.target_languages = target_languages
+
+    def to_dict(self):
+        """Returns the raw form of the translation options as the api
+        expects them"""
+        dict_result = {"target_languages": [tl.to_dict() for tl in self.target_languages]}
+
+        return dict_result
+
+
+class Translation(TranslationOptions):
+    """Translation options."""
+    def __init__(
+            self,
+            target_languages: List[TranslationLanguageOptions],
+            completed_on: str = None):
+        super().__init__(target_languages)
+        self.completed_on = completed_on
+
+    @classmethod
+    def from_json(cls, json):
+        if json is None:
+            return None
+        return cls(
+            [TranslationLanguage.from_json(tl) for tl in json.get('target_languages')],
+            json.get('completed_on')
+        )
diff --git a/src/rev_ai/models/nlp_model.py b/src/rev_ai/models/nlp_model.py
new file mode 100644
index 00000000..a30984b8
--- /dev/null
+++ b/src/rev_ai/models/nlp_model.py
@@ -0,0 +1,6 @@
+from enum import Enum
+
+
+class NlpModel(str, Enum):
+    STANDARD = "standard"
+    PREMIUM = "premium"
diff --git a/tests/test_async_captions_translation.py b/tests/test_async_captions_translation.py
new file mode 100644
index 00000000..ea9093af
--- /dev/null
+++ b/tests/test_async_captions_translation.py
@@ -0,0 +1,60 @@
+import json
+
+import pytest
+
+from src.rev_ai import JobStatus
+from src.rev_ai.apiclient import RevAiAPIClient
+from src.rev_ai.models.asynchronous.summarization_formatting_options import SummarizationFormattingOptions
+from src.rev_ai.models.asynchronous.summarization_job_status import SummarizationJobStatus
+from src.rev_ai.models.asynchronous.summarization_options import SummarizationOptions
+from src.rev_ai.models.asynchronous.translation_job_status import TranslationJobStatus
+from src.rev_ai.models.asynchronous.translation_language_options import TranslationLanguageOptions
+from src.rev_ai.models.asynchronous.translation_options import TranslationOptions
+from src.rev_ai.models.nlp_model import NlpModel
+
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin
+
+TOKEN = "token"
+JOB_ID = '1'
+JOB_ID_URL = urljoin(RevAiAPIClient.base_url, 'jobs/{}'.format(JOB_ID))
+JOBS_URL = urljoin(RevAiAPIClient.base_url, 'jobs')
+
+
+@pytest.mark.usefixtures('mock_session', 'make_mock_response')
+class TestAsyncCaptionsTranslation():
+
+    def test_get_translated_captions(self, mock_session, make_mock_response):
+        client = RevAiAPIClient(TOKEN)
+        url = '{}/captions/translation/{}'.format(JOB_ID_URL, "es")
+        response = make_mock_response(url=url, text='es captions')
+        mock_session.request.return_value = response
+
+        captions = client.get_translated_captions(JOB_ID, "es")
+        assert captions == 'es captions'
+        mock_session.request.assert_called_once_with(
+            "GET",
+            url,
+            headers=self.hdr_fixture(client, {'Accept': 'application/x-subrip'}))
+
+    def test_get_translated_captions_as_stream(self, mock_session, make_mock_response):
+        client = RevAiAPIClient(TOKEN)
+        url = '{}/captions/translation/{}'.format(JOB_ID_URL, "es")
+        response = make_mock_response(url=url, text='es captions')
+        mock_session.request.return_value = response
+
+        captions = client.get_translated_captions_as_stream(JOB_ID, "es")
+        assert captions.content == 'es captions'
+        mock_session.request.assert_called_once_with(
+            "GET",
+            url,
+            headers=self.hdr_fixture(client, {'Accept': 'application/x-subrip'}), stream=True)
+
+    @staticmethod
+    def hdr_fixture(client: RevAiAPIClient, additional_headers):
+        hdr = {}
+        hdr.update(client.default_headers)
+        hdr.update(additional_headers)
+        return hdr
diff --git a/tests/test_async_summarization.py b/tests/test_async_summarization.py
new file mode 100644
index 00000000..35efb6e4
--- /dev/null
+++ b/tests/test_async_summarization.py
@@ -0,0 +1,210 @@
+import json
+
+import pytest
+
+from src.rev_ai.apiclient import RevAiAPIClient
+from src.rev_ai.models.asynchronous.summarization_formatting_options import SummarizationFormattingOptions
+from src.rev_ai.models.asynchronous.summarization_job_status import SummarizationJobStatus
+from src.rev_ai.models.asynchronous.summarization_options import SummarizationOptions
+from src.rev_ai.models.nlp_model import NlpModel
+
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin
+
+TOKEN = "token"
+JOB_ID = '1'
+JOB_ID_URL = urljoin(RevAiAPIClient.base_url, 'jobs/{}'.format(JOB_ID))
+JOBS_URL = urljoin(RevAiAPIClient.base_url, 'jobs')
+JOB_TRANSCRIPT_SUMMARY_URL = '{}/transcript/summary'.format(JOB_ID_URL)
+
+
+@pytest.mark.usefixtures('mock_session', 'make_mock_response')
+class TestAsyncSummarization():
+
+    def test_submit_local_file(self, mocker, mock_session, make_mock_response):
+        status = 'transcribed'
+        created_on = '2018-05-05T23:23:22.29Z'
+        completed_on = '2018-05-05T23:23:22.30Z'
+        data = {
+            'id': JOB_ID,
+            'created_on': created_on,
+            'status': status,
+            'summarization': {
+                'prompt': 'Try to summarize this transcript as good as you possibly can',
+                'model': 'premium',
+                'type': 'bullets',
+                'status': 'completed',
+                'completed_on': completed_on
+            }
+        }
+        response = make_mock_response(url=JOB_ID_URL, json_data=data)
+        mock_session.request.return_value = response
+        client = RevAiAPIClient(TOKEN)
+
+        with mocker.patch('src.rev_ai.apiclient.open', create=True)() as file:
+            job = client.submit_job_local_file('test_mp3.mp3',
+                                               language="en",
+                                               summarization_config=SummarizationOptions(
+                                                   prompt="Try to summarize this transcript as good as you possibly can",
+                                                   model=NlpModel.PREMIUM,
+                                                   formatting_type=SummarizationFormattingOptions.BULLETS
+
+                                               ))
+            mock_session.request.assert_called_once_with(
+                "POST",
+                JOBS_URL,
+                files={
+                    'media': ('test_mp3.mp3', file),
+                    'options': (
+                        None,
+                        json.dumps({
+                            'language': 'en',
+                            'summarization_config': {
+                                'prompt': "Try to summarize this transcript as good as you possibly can",
+                                'model': 'premium',
+                                'type': 'bullets'
+                            }
+                        }, sort_keys=True)
+                    )
+                },
+                headers=client.default_headers
+            )
+
+        assert job.summarization is not None
+        assert job.summarization.model == NlpModel.PREMIUM
+        assert job.summarization.type == SummarizationFormattingOptions.BULLETS
+        assert job.summarization.prompt == "Try to summarize this transcript as good as you possibly can"
+
+    def test_submit_source_url(self, mock_session, make_mock_response):
+        status = 'transcribed'
+        created_on = '2018-05-05T23:23:22.29Z'
+        completed_on = '2018-05-05T23:23:22.30Z'
+        data = {
+            'id': JOB_ID,
+            'status': status,
+            'created_on': created_on,
+            'summarization': {
+                'prompt': 'Try to summarize this transcript as good as you possibly can',
+                'model': 'premium',
+                'type': 'bullets',
+                'status': 'completed',
+                'completed_on': completed_on
+            }
+        }
+        response = make_mock_response(url=JOB_ID_URL, json_data=data)
+        mock_session.request.return_value = response
+
+        client = RevAiAPIClient(TOKEN)
+
+        job = client.submit_job_url('https://example.com/test.mp3',
+                                    language="en",
+                                    summarization_config=SummarizationOptions(
+                                        "Try to summarize this transcript as good as you possibly can",
+                                        NlpModel.PREMIUM,
+                                        SummarizationFormattingOptions.BULLETS
+
+                                    ))
+        mock_session.request.assert_called_once_with(
+            "POST",
+            JOBS_URL,
+            json={
+                'media_url': 'https://example.com/test.mp3',
+                'language': 'en',
+                'summarization_config': {
+                    'prompt': "Try to summarize this transcript as good as you possibly can",
+                    'model': 'premium',
+                    'type': 'bullets'
+                }
+            },
+            headers=client.default_headers
+        )
+
+        assert job.summarization is not None
+        assert job.summarization.status == SummarizationJobStatus.COMPLETED
+        assert job.summarization.model == NlpModel.PREMIUM
+        assert job.summarization.type == SummarizationFormattingOptions.BULLETS
+        assert job.summarization.prompt == "Try to summarize this transcript as good as you possibly can"
+
+    def test_get_transcript_summary_text(self, mock_session, make_mock_response):
+        url = JOB_TRANSCRIPT_SUMMARY_URL
+        client = RevAiAPIClient(TOKEN)
+        response = make_mock_response(url=url, text='transcript summary')
+        mock_session.request.return_value = response
+
+        summary = client.get_transcript_summary_text(JOB_ID)
+        assert summary == 'transcript summary'
+        mock_session.request.assert_called_once_with(
+            "GET",
+            url,
+            headers=self.hdr_fixture(client, {'Accept': 'text/plain'})
+        )
+
+    def test_get_transcript_summary_json_paragraph(self, mock_session, make_mock_response):
+        url = JOB_TRANSCRIPT_SUMMARY_URL
+        data = {
+            'summary': 'transcript summary'
+        }
+        client = RevAiAPIClient(TOKEN)
+        response = make_mock_response(url=url, json_data=data)
+        mock_session.request.return_value = response
+
+        summary = client.get_transcript_summary_json(JOB_ID)
+        assert summary.summary == 'transcript summary'
+        mock_session.request.assert_called_once_with(
+            "GET",
+            url,
+            headers=self.hdr_fixture(client, {'Accept': 'application/json'})
+        )
+
+    def test_get_transcript_summary_json_paragraph_as_stream(self, mock_session, make_mock_response):
+        url = JOB_TRANSCRIPT_SUMMARY_URL
+        data = {
+            "summary": "transcript summary"
+        }
+
+        client = RevAiAPIClient(TOKEN)
+        response = make_mock_response(url=url, json_data=data)
+        mock_session.request.return_value = response
+
+        summary = client.get_transcript_summary_json_as_stream(JOB_ID)
+
+        s = summary.content.decode('utf-8')
+        s = s.replace('\'', '"')
+
+        assert json.loads(s) == data
+
+        mock_session.request.assert_called_once_with(
+            "GET",
+            url,
+            headers=self.hdr_fixture(client, {'Accept': 'application/json'}),
+            stream=True
+        )
+
+    def test_get_transcript_summary_json_bullets(self, mock_session, make_mock_response):
+        url = JOB_TRANSCRIPT_SUMMARY_URL
+        data = {
+            'bullet_points': ['bullet1', 'bullet2']
+        }
+        client = RevAiAPIClient(TOKEN)
+        response = make_mock_response(url=url, json_data=data)
+        mock_session.request.return_value = response
+
+        summary_json = client.get_transcript_summary_json(JOB_ID)
+        assert summary_json is not None
+        assert summary_json.bullet_points is not None
+        assert len(summary_json.bullet_points) == 2
+
+        mock_session.request.assert_called_once_with(
+            "GET",
+            url,
+            headers=self.hdr_fixture(client, {'Accept': 'application/json'})
+        )
+
+    @staticmethod
+    def hdr_fixture(client: RevAiAPIClient, additional_headers):
+        hdr = {}
+        hdr.update(client.default_headers)
+        hdr.update(additional_headers)
+        return hdr
diff --git a/tests/test_async_translation.py b/tests/test_async_translation.py
new file mode 100644
index 00000000..d9d6f59e
--- /dev/null
+++ b/tests/test_async_translation.py
@@ -0,0 +1,250 @@
+import json
+
+import pytest
+
+from src.rev_ai import JobStatus
+from src.rev_ai.apiclient import RevAiAPIClient
+from src.rev_ai.models.asynchronous.summarization_formatting_options import SummarizationFormattingOptions
+from src.rev_ai.models.asynchronous.summarization_job_status import SummarizationJobStatus
+from src.rev_ai.models.asynchronous.summarization_options import SummarizationOptions
+from src.rev_ai.models.asynchronous.translation_job_status import TranslationJobStatus
+from src.rev_ai.models.asynchronous.translation_language_options import TranslationLanguageOptions
+from src.rev_ai.models.asynchronous.translation_options import TranslationOptions
+from src.rev_ai.models.nlp_model import NlpModel
+
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin
+
+TOKEN = "token"
+JOB_ID = '1'
+JOB_ID_URL = urljoin(RevAiAPIClient.base_url, 'jobs/{}'.format(JOB_ID))
+JOBS_URL = urljoin(RevAiAPIClient.base_url, 'jobs')
+
+
+@pytest.mark.usefixtures('mock_session', 'make_mock_response')
+class TestAsyncTranslation():
+
+    def test_submit_local_file(self, mocker, mock_session, make_mock_response):
+        created_on = '2018-05-05T23:23:22.30Z'
+        completed_on = '2018-05-05T23:24:22.30Z'
+        status = 'transcribed'
+        data = {
+            'id': JOB_ID,
+            'created_on': created_on,
+            'status': status,
+            'translation': {
+                'target_languages': [
+                    {
+                        'language': 'es',
+                        'model': 'premium',
+                        'status': 'completed'
+                    },
+                    {
+                        'language': 'ru',
+                        'model': 'premium',
+                        'status': 'completed'
+                    }
+                ],
+                'completed_on': completed_on
+            }
+        }
+        response = make_mock_response(url=JOB_ID_URL, json_data=data)
+        mock_session.request.return_value = response
+        client = RevAiAPIClient(TOKEN)
+
+        with mocker.patch('src.rev_ai.apiclient.open', create=True)() as file:
+            job = client.submit_job_local_file('test_mp3.mp3',
+                                               language="en",
+                                               translation_config=TranslationOptions(
+                                                   target_languages=[
+                                                       TranslationLanguageOptions("es", NlpModel.PREMIUM),
+                                                       TranslationLanguageOptions("ru")
+                                                   ]
+                                               ))
+            mock_session.request.assert_called_once_with(
+                "POST",
+                JOBS_URL,
+                files={
+                    'media': ('test_mp3.mp3', file),
+                    'options': (
+                        None,
+                        json.dumps({
+                            'language': 'en',
+                            'translation_config': {
+                                'target_languages': [
+                                    {
+                                        'language': 'es',
+                                        'model': 'premium'
+                                    },
+                                    {
+                                        'language': 'ru'
+                                    }
+
+                                ]}
+                        }, sort_keys=True)
+                    )
+                },
+                headers=client.default_headers
+            )
+        assert job.translation is not None
+        assert job.translation.target_languages is not None
+        assert len(job.translation.target_languages) == 2
+
+        assert job.translation.completed_on is not None
+        assert job.translation.target_languages[0].status == TranslationJobStatus.COMPLETED
+        assert job.translation.target_languages[0].language == "es"
+        assert job.translation.target_languages[0].model == NlpModel.PREMIUM
+
+        assert job.translation.target_languages[1].status == TranslationJobStatus.COMPLETED
+        assert job.translation.target_languages[1].language == "ru"
+
+    def test_submit_source_url(self, mock_session, make_mock_response):
+        status = 'transcribed'
+        created_on = '2018-05-05T23:23:22.29Z'
+        completed_on = '2018-05-05T23:23:22.30Z'
+        data = {
+            'id': JOB_ID,
+            'status': status,
+            'created_on': created_on,
+            'summarization': {
+                'prompt': 'Try to summarize this transcript as good as you possibly can',
+                'model': 'premium',
+                'type': 'bullets',
+                'status': 'completed',
+                'completed_on': completed_on
+            },
+            'translation': {
+                'target_languages': [
+                    {
+                        'language': 'es',
+                        'model': 'premium',
+                        'status': 'completed'
+                    },
+                    {
+                        'language': 'ru',
+                        'model': 'premium',
+                        'status': 'completed'
+                    }
+                ],
+                'completed_on': completed_on
+            }
+        }
+        response = make_mock_response(url=JOB_ID_URL, json_data=data)
+        mock_session.request.return_value = response
+
+        client = RevAiAPIClient(TOKEN)
+
+        job = client.submit_job_url('https://example.com/test.mp3',
+                                    language="en",
+                                    translation_config=TranslationOptions(
+                                        target_languages=[
+                                            TranslationLanguageOptions("es", NlpModel.PREMIUM),
+                                            TranslationLanguageOptions("ru")
+                                        ]
+                                    )
+                                    )
+        mock_session.request.assert_called_once_with(
+            "POST",
+            JOBS_URL,
+            json={
+                'media_url': 'https://example.com/test.mp3',
+                'language': 'en',
+                'translation_config': {
+                    'target_languages': [
+                        {
+                            'language': 'es',
+                            'model': 'premium'
+                        },
+                        {
+                            'language': 'ru'
+                        }
+
+                    ]}
+            },
+            headers=client.default_headers
+        )
+        assert job.translation is not None
+        assert job.translation.target_languages is not None
+        assert len(job.translation.target_languages) == 2
+
+        assert job.translation.completed_on is not None
+        assert job.translation.target_languages[0].status == TranslationJobStatus.COMPLETED
+        assert job.translation.target_languages[0].language == "es"
+        assert job.translation.target_languages[0].model == NlpModel.PREMIUM
+
+        assert job.translation.target_languages[1].status == TranslationJobStatus.COMPLETED
+        assert job.translation.target_languages[1].language == "ru"
+
+    def test_get_translated_transcript_text(self, mock_session, make_mock_response):
+        client = RevAiAPIClient(TOKEN)
+        url = '{}/transcript/translation/{}'.format(JOB_ID_URL, "es")
+        response = make_mock_response(url=url, text='es transcript')
+        mock_session.request.return_value = response
+
+        translation = client.get_translated_transcript_text(JOB_ID, "es")
+        assert translation == 'es transcript'
+        mock_session.request.assert_called_once_with(
+            "GET",
+            url,
+            headers=self.hdr_fixture(client, {'Accept': 'text/plain'}))
+
+    def test_get_translated_transcript_json(self, mock_session, make_mock_response):
+        client = RevAiAPIClient(TOKEN)
+        url = '{}/transcript/translation/{}'.format(JOB_ID_URL, "es")
+        data = {
+            'monologues': []
+        }
+        response = make_mock_response(url=url, json_data=data)
+        mock_session.request.return_value = response
+
+        translation = client.get_translated_transcript_json(JOB_ID, "es")
+        assert translation == data
+        mock_session.request.assert_called_once_with(
+            "GET",
+            url,
+            headers=self.hdr_fixture(client, {'Accept': 'application/vnd.rev.transcript.v1.0+json'}))
+
+    def test_get_translated_transcript_json_as_stream(self, mock_session, make_mock_response):
+        client = RevAiAPIClient(TOKEN)
+        url = '{}/transcript/translation/{}'.format(JOB_ID_URL, "es")
+        data = {
+            'monologues': []
+        }
+        response = make_mock_response(url=url, json_data=data)
+        mock_session.request.return_value = response
+
+        translation = client.get_translated_transcript_json_as_stream(JOB_ID, "es")
+        s = translation.content.decode('utf-8')
+        s = s.replace('\'', '"')
+        assert json.loads(s) == data
+        mock_session.request.assert_called_once_with(
+            "GET",
+            url,
+            headers=self.hdr_fixture(client, {'Accept': 'application/vnd.rev.transcript.v1.0+json'}), stream=True)
+
+    def test_get_translated_transcript_object(self, mock_session, make_mock_response):
+        client = RevAiAPIClient(TOKEN)
+        url = '{}/transcript/translation/{}'.format(JOB_ID_URL, "es")
+        data = {
+            'monologues': [
+                {'speaker': 123}
+            ]
+        }
+        response = make_mock_response(url=url, json_data=data)
+        mock_session.request.return_value = response
+
+        translation = client.get_translated_transcript_object(JOB_ID, "es")
+        assert translation.monologues[0].speaker == 123
+        mock_session.request.assert_called_once_with(
+            "GET",
+            url,
+            headers=self.hdr_fixture(client, {'Accept': 'application/vnd.rev.transcript.v1.0+json'}))
+
+    @staticmethod
+    def hdr_fixture(client: RevAiAPIClient, additional_headers):
+        hdr = {}
+        hdr.update(client.default_headers)
+        hdr.update(additional_headers)
+        return hdr
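For reference, the pieces above compose end to end roughly as follows. This is a minimal usage sketch, not part of the diff: the access token, media URL, target language, and 30-second polling interval are illustrative, it assumes the client's existing `get_job_details` call for polling, and it assumes the job detail payload echoes the `summarization` and `translation` blocks the same way the mocked responses in the tests above do.

```python
from time import sleep

from rev_ai import JobStatus
from rev_ai.apiclient import RevAiAPIClient
from rev_ai.models.asynchronous.summarization_job_status import SummarizationJobStatus
from rev_ai.models.asynchronous.summarization_options import SummarizationOptions
from rev_ai.models.asynchronous.summarization_formatting_options import SummarizationFormattingOptions
from rev_ai.models.asynchronous.translation_job_status import TranslationJobStatus
from rev_ai.models.asynchronous.translation_options import TranslationOptions
from rev_ai.models.asynchronous.translation_language_options import TranslationLanguageOptions
from rev_ai.models.nlp_model import NlpModel

client = RevAiAPIClient("your-access-token")  # illustrative token

# Request a bulleted summary and a premium Spanish translation alongside the transcript.
job = client.submit_job_url(
    "https://example.com/file-to-transcribe.mp3",
    language="en",
    summarization_config=SummarizationOptions(
        formatting_type=SummarizationFormattingOptions.BULLETS),
    translation_config=TranslationOptions(
        target_languages=[TranslationLanguageOptions("es", NlpModel.PREMIUM)]))

# Poll until the transcript, summary, and translation have all completed.
while True:
    details = client.get_job_details(job.id)
    if details.status == JobStatus.FAILED:
        raise RuntimeError("job failed")
    if details.status == JobStatus.TRANSCRIBED \
            and details.summarization.status == SummarizationJobStatus.COMPLETED \
            and all(tl.status == TranslationJobStatus.COMPLETED
                    for tl in details.translation.target_languages):
        break
    sleep(30)  # illustrative polling interval

# Retrieve the new outputs with the getters added in this change.
summary = client.get_transcript_summary_json(job.id)  # Summary object with .summary / .bullet_points
spanish_transcript = client.get_translated_transcript_object(job.id, "es")
spanish_captions = client.get_translated_captions(job.id, "es")
```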