diff --git a/src/road_core_eval/query_rag.py b/src/road_core_eval/query_rag.py
index 8bf8105..c07eb5b 100644
--- a/src/road_core_eval/query_rag.py
+++ b/src/road_core_eval/query_rag.py
@@ -29,6 +29,7 @@
 def main():
+    """Query RAG vector database"""
     parser = argparse.ArgumentParser(
         description="Utility script for querying RAG database"
     )
diff --git a/src/road_core_eval/response_evaluation.py b/src/road_core_eval/response_evaluation.py
index 213666a..327785e 100644
--- a/src/road_core_eval/response_evaluation.py
+++ b/src/road_core_eval/response_evaluation.py
@@ -5,8 +5,8 @@
 from collections import defaultdict
 from datetime import UTC, datetime
 from time import sleep
-from httpx import Client
 from argparse import Namespace
+from httpx import Client
 from pandas import DataFrame, concat, read_csv, read_parquet
 from tqdm import tqdm
@@ -30,7 +30,7 @@
 tqdm.pandas()
-# TODO: OLS-712 Enrichment of Q+A pairs to contain questions with attachments
+# TODO: OLS-712 Enrichment of Q+A pairs to contain questions with attachments pylint: disable=W0511
 class ResponseEvaluation:
     """Evaluate LLM response."""
@@ -84,7 +84,7 @@ def _load_config_and_rag(self) -> None:
         # load rag index
         config.rag_index  # pylint: disable=W0104
         if config.rag_index is None:
-            raise Exception("No valid rag index for ols_rag mode")
+            raise ValueError("No valid rag index for ols_rag mode")
     def _load_qna_pool_parquet(self) -> DataFrame:
         """Load QnA pool from parquet file."""
@@ -149,6 +149,7 @@ def _get_inscope_qna(self, provider_model_id: str) -> DataFrame:
         qna_pool_df = qna_pool_df[qna_pool_df.in_use]
         return qna_pool_df.reset_index(drop=True).drop(columns="in_use")
+    # pylint: disable=R0913,R0917
     def _get_api_response(
         self,
         question: str,
@@ -173,7 +174,7 @@ def _get_api_response(
                     self._api_client,
                 )
                 break
-            except Exception:
+            except Exception:  # pylint: disable=W0718
                 if retry_counter == retry_attempts - 1:
                     raise
                 # model is not realiable if it's overloaded, so take some time between requests
@@ -185,6 +186,7 @@
         )
         return response
+    # pylint: disable=R0913,R0917
     def _get_recent_response(
         self,
         question: str,
@@ -274,7 +276,7 @@ def _get_response_with_score(self) -> DataFrame:
                     f"{provider_model_id.replace('/', '-')}.csv"
                 )
                 print("Temp score file exists. Proceeding without calculation.")
-            except Exception:
+            except Exception:  # pylint: disable=W0718
                 print("Temp score doesn't exist. Proceeding with calculation.")
                 qna_pool_df = self._get_inscope_qna(provider_model_id)
                 qna_pool_df = self._get_model_response(
diff --git a/src/road_core_eval/utils/models.py b/src/road_core_eval/utils/models.py
index 496aff2..abfe95e 100644
--- a/src/road_core_eval/utils/models.py
+++ b/src/road_core_eval/utils/models.py
@@ -7,6 +7,7 @@
 from ols.src.llms.providers.watsonx import Watsonx
+# pylint: disable=R0903
 class OpenAIVanilla(OpenAI):
     """OpenAI provider."""
@@ -30,6 +31,7 @@ def default_params(self):
         }
+# pylint: disable=R0903
 class AzureOpenAIVanilla(AzureOpenAI):
     """Azure OpenAI provider."""
@@ -68,6 +70,7 @@ def default_params(self):
         return default_parameters
+# pylint: disable=R0903
 class WatsonxVanilla(Watsonx):
     """Watsonx provider."""
diff --git a/src/road_core_eval/utils/prompts.py b/src/road_core_eval/utils/prompts.py
index a09e6f3..63b6f26 100644
--- a/src/road_core_eval/utils/prompts.py
+++ b/src/road_core_eval/utils/prompts.py
@@ -7,6 +7,7 @@
 You are a helpful assistant.
 """
+# pylint: disable=C0301
 # Below is inspired by both ragas & langchain internal/example prompts.
 ANSWER_RELEVANCY_PROMPT = """You are an helpful assistant. Your task is to analyze answer and come up with questions from the given answer.
 Given the following answer delimited by three backticks please generate {num_questions} questions.
diff --git a/src/road_core_eval/utils/relevancy_score.py b/src/road_core_eval/utils/relevancy_score.py
index d44512a..2cec4b6 100644
--- a/src/road_core_eval/utils/relevancy_score.py
+++ b/src/road_core_eval/utils/relevancy_score.py
@@ -11,7 +11,7 @@
 from road_core_eval.utils.prompts import ANSWER_RELEVANCY_PROMPT
-class AnswerRelevancyScore:
+class AnswerRelevancyScore:  # pylint: disable=R0903
     """Calculate response/answer relevancy score."""
     def __init__(self, judge_llm, embedding_model):
@@ -53,7 +53,7 @@ def get_score(
                 gen_questions = "\n".join(gen_questions)
                 break
-            except Exception as e:
+            except Exception as e:  # pylint: disable=W0718
                 if retry_counter == retry_attempts - 1:
                     print(f"error_answer_relevancy: {e}")
                     score, valid_flag, gen_questions = None, None, None
diff --git a/src/road_core_eval/utils/response.py b/src/road_core_eval/utils/response.py
index 39402e3..03b1c1b 100644
--- a/src/road_core_eval/utils/response.py
+++ b/src/road_core_eval/utils/response.py
@@ -6,6 +6,7 @@
 from ols import config
 from ols.constants import GenericLLMParameters
 from ols.src.prompts.prompt_generator import GeneratePrompt
+from requests import HTTPError
 from road_core_eval.constants import REST_API_TIMEOUT
 from road_core_eval.utils.models import MODEL_OLS_PARAM, VANILLA_MODEL
@@ -30,7 +31,7 @@ def get_model_response(
             timeout=REST_API_TIMEOUT,
         )
         if response.status_code != 200:
-            raise Exception(response)
+            raise HTTPError(response=response)
         return response.json()["response"].strip()
     prompt = PromptTemplate.from_template("{query}")
diff --git a/src/road_core_eval/utils/score.py b/src/road_core_eval/utils/score.py
index 99631af..75ad0b7 100644
--- a/src/road_core_eval/utils/score.py
+++ b/src/road_core_eval/utils/score.py
@@ -12,7 +12,7 @@
 from road_core_eval.utils.similarity_score_llm import AnswerSimilarityScore
-class ResponseScore:
+class ResponseScore:  # pylint: disable=R0903
     """Calculate response score."""
     def __init__(self, eval_metrics: list, judge_provider: str, judge_model: str):
@@ -39,7 +39,9 @@ def __init__(self, eval_metrics: list, judge_provider: str, judge_model: str):
         if "answer_similarity_llm" in judge_llm_required:
             self._llm_similarity_scorer = AnswerSimilarityScore(judge_llm)
-    def calculate_scores(self, query: str, answer: str, response: str) -> tuple:
+    def calculate_scores(  # pylint: disable=R0914
+        self, query: str, answer: str, response: str
+    ) -> tuple:
         """Calculate different similarity scores for two strings."""
         res_vec = self._embedding_model.get_text_embedding(response)
         ans_vec = self._embedding_model.get_text_embedding(answer)
diff --git a/src/road_core_eval/utils/similarity_score_llm.py b/src/road_core_eval/utils/similarity_score_llm.py
index 7551387..96975fa 100644
--- a/src/road_core_eval/utils/similarity_score_llm.py
+++ b/src/road_core_eval/utils/similarity_score_llm.py
@@ -8,7 +8,7 @@
 from road_core_eval.utils.prompts import ANSWER_SIMILARITY_PROMPT
-class AnswerSimilarityScore:
+class AnswerSimilarityScore:  # pylint: disable=R0903
     """Get similarity score generated by LLM."""
     def __init__(self, judge_llm):
@@ -16,6 +16,7 @@ def __init__(self, judge_llm):
         prompt = PromptTemplate.from_template(ANSWER_SIMILARITY_PROMPT)
         self._judge_llm = prompt | judge_llm
+    # pylint: disable=R0913,R0917
     def get_score(
         self,
         question,
@@ -37,7 +38,7 @@ def get_score(
                 )
                 score = float(result.content) / 10
                 break
-            except Exception as e:
+            except Exception as e:  # pylint: disable=W0718
                 if retry_counter == retry_attempts - 1:
                     print(f"error_answer_relevancy: {e}")
                     # Continue with score as None
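
Note on the `# pylint: disable=W0718` pragmas added above: they keep the intentionally broad `except Exception` blocks inside the retry loops rather than narrowing them, because the final attempt re-raises (or records the error) instead of swallowing it. A minimal standalone sketch of that pattern, assuming a generic `call_with_retries` helper and illustrative attempt/backoff defaults that are not part of the diff:

from time import sleep


def call_with_retries(func, retry_attempts: int = 3, backoff_seconds: int = 5):
    """Run func(), retrying on any failure; the last attempt re-raises."""
    for retry_counter in range(retry_attempts):
        try:
            return func()
        # Broad catch is deliberate: any transient judge/inference error triggers a retry.
        except Exception:  # pylint: disable=W0718
            if retry_counter == retry_attempts - 1:
                raise
            # The model may be overloaded, so take some time between requests.
            sleep(backoff_seconds)
    return None  # not reached: the last attempt either returns or re-raises

This mirrors `_get_api_response` (re-raise on the last attempt) and the two `get_score` helpers (print the error and continue with a None score).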
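
For the `utils/response.py` change, raising `requests.HTTPError(response=response)` instead of a bare `Exception` keeps the failing `httpx` response attached to the exception, so callers can inspect the status code and body. A rough sketch of the pattern; the endpoint path, payload shape, base URL, and timeout value below are assumptions for illustration, not taken from the diff, and the usage example assumes a service listening locally:

import httpx
from requests import HTTPError

REST_API_TIMEOUT = 120  # assumed value; the real constant lives in road_core_eval.constants


def query_api(client: httpx.Client, question: str) -> str:
    """POST a question to a hypothetical /v1/query endpoint and return the answer text."""
    response = client.post(
        "/v1/query",               # hypothetical endpoint path
        json={"query": question},  # hypothetical payload shape
        timeout=REST_API_TIMEOUT,
    )
    if response.status_code != 200:
        # Same idea as the diff: attach the response instead of raising a bare Exception.
        raise HTTPError(response=response)
    return response.json()["response"].strip()


try:
    answer = query_api(httpx.Client(base_url="http://localhost:8080"), "What is RAG?")
except HTTPError as err:
    # err.response is the original httpx.Response, so status and body survive for logging.
    print(f"query failed: HTTP {err.response.status_code}: {err.response.text}")

Because `requests.RequestException` stores the `response` keyword on the exception, `err.response` at the catch site is the same object that failed the status check.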