From adb03778420f4de914ce11a43b5cba8796aef09b Mon Sep 17 00:00:00 2001
From: Tom Searle
-Further information about MedCAT can be found from their [github](https://github.com/CogStack/MedCAT)
+Further information about MedCAT can be found from their [github](https://github.com/CogStack/MedCAT)
or via their official documentation [here](https://medcat.readthedocs.io/en/latest/).
General MedCAT tutorials can be found [here](https://github.com/CogStack/MedCATtutorials).
diff --git a/cogstack.py b/cogstack.py
index 070c84d..c586c85 100644
--- a/cogstack.py
+++ b/cogstack.py
@@ -1,7 +1,5 @@
import getpass
from typing import Dict, List, Any, Optional, Iterable, Tuple
-import elasticsearch
-import elasticsearch.helpers
import pandas as pd
from tqdm.notebook import tqdm
import eland as ed
@@ -11,52 +9,83 @@
from credentials import *
+# Import search engine clients
+try:
+ import elasticsearch
+ import elasticsearch.helpers
+ ELASTICSEARCH_AVAILABLE = True
+except ImportError:
+ ELASTICSEARCH_AVAILABLE = False
+
+try:
+ import opensearchpy
+ import opensearchpy.helpers
+ OPENSEARCH_AVAILABLE = True
+except ImportError:
+ OPENSEARCH_AVAILABLE = False
+
class CogStack(object):
"""
- A class for interacting with Elasticsearch.
-
+ A class for interacting with Elasticsearch or OpenSearch.
+
Args:
- hosts (List[str]): A list of Elasticsearch host URLs.
- username (str, optional): The username to use when connecting to Elasticsearch. If not provided, the user will be prompted to enter a username.
- password (str, optional): The password to use when connecting to Elasticsearch. If not provided, the user will be prompted to enter a password.
- api (bool, optional): A boolean value indicating whether to use API keys or basic authentication to connect to Elasticsearch. Defaults to False (i.e., use basic authentication). Elasticsearch 7.17.
- api_key (str, optional): The API key to use when connecting to Elasticsearch.
+ hosts (List[str]): A list of search engine host URLs.
+ username (str, optional): The username to use when connecting to the search engine. If not provided, the user will be prompted to enter a username.
+ password (str, optional): The password to use when connecting to the search engine. If not provided, the user will be prompted to enter a password.
+ api (bool, optional): A boolean value indicating whether to use API keys or basic authentication to connect to the search engine. Defaults to False (i.e., use basic authentication). Elasticsearch 7.17.
+ api_key (str, optional): The API key to use when connecting to the search engine.
When provided along with `api=True`, this takes precedence over username/password. Only available when using Elasticsearch 8.17.
+ use_opensearch (bool, optional): A boolean value indicating whether to use OpenSearch instead of Elasticsearch. Defaults to False (i.e., use Elasticsearch).
+ timeout (int, optional): The timeout in seconds for connections. Defaults to 60.
"""
def __init__(self, hosts: List, username: Optional[str] = None, password: Optional[str] = None,
- api: bool = False, timeout: Optional[int]=60, api_key: Optional[str] = None):
+ api: bool = False, timeout: Optional[int]=60, api_key: Optional[str] = None,
+ use_opensearch: bool = False):
+
+ # Validate that the required client is available
+ if use_opensearch and not OPENSEARCH_AVAILABLE:
+ raise ImportError("OpenSearch client is not available. Please install opensearch-py: pip install opensearch-py")
+ elif not use_opensearch and not ELASTICSEARCH_AVAILABLE:
+ raise ImportError("Elasticsearch client is not available. Please install elasticsearch: pip install elasticsearch")
+
+ # Choose the appropriate client and helpers
+ if use_opensearch:
+ client_class = opensearchpy.OpenSearch
+ self.helpers = opensearchpy.helpers
+ else:
+ client_class = elasticsearch.Elasticsearch
+ self.helpers = elasticsearch.helpers
if api_key and api:
- self.elastic = elasticsearch.Elasticsearch(hosts=hosts,
- api_key=api_key,
- verify_certs=False,
- timeout=timeout)
-
-
+ self.elastic = client_class(hosts=hosts,
+ api_key=api_key,
+ verify_certs=False,
+ timeout=timeout)
+
elif api:
api_username, api_password = self._check_auth_details(username, password)
- self.elastic = elasticsearch.Elasticsearch(hosts=hosts,
- api_key=(api_username, api_password),
- verify_certs=False,
- timeout=timeout)
-
+ self.elastic = client_class(hosts=hosts,
+ api_key=(api_username, api_password),
+ verify_certs=False,
+ timeout=timeout)
+
else:
username, password = self._check_auth_details(username, password)
- self.elastic = elasticsearch.Elasticsearch(hosts=hosts,
- basic_auth=(username, password),
- verify_certs=False,
- timeout=timeout)
+ self.elastic = client_class(hosts=hosts,
+ basic_auth=(username, password),
+ verify_certs=False,
+ timeout=timeout)
def _check_auth_details(self, username=None, password=None) -> Tuple[str, str]:
"""
Prompt the user for a username and password if the values are not provided as function arguments.
-
+
Args:
api_username (str, optional): The API username. If not provided, the user will be prompted to enter a username.
api_password (str, optional): The API password. If not provided, the user will be prompted to enter a password.
-
+
Returns:
Tuple[str, str]: A tuple containing the API username and password.
"""
@@ -68,18 +97,18 @@ def _check_auth_details(self, username=None, password=None) -> Tuple[str, str]:
def get_docs_generator(self, index: List, query: Dict, es_gen_size: int=800, request_timeout: Optional[int] = 300):
"""
- Retrieve a generator object that can be used to iterate through documents in an Elasticsearch index.
-
+ Retrieve a generator object that can be used to iterate through documents in an Elasticsearch or OpenSearch index.
+
Args:
- index (List[str]): A list of Elasticsearch index names to search.
+ index (List[str]): A list of search engine index names to search.
query (Dict): A dictionary containing the search query parameters.
es_gen_size (int, optional): The number of documents to retrieve per batch. Defaults to 800.
- request_timeout (int, optional): The time in seconds to wait for a response from Elasticsearch before timing out. Defaults to 300.
+ request_timeout (int, optional): The time in seconds to wait for a response from the search engine before timing out. Defaults to 300.
Returns:
- generator: A generator object that can be used to iterate through the documents in the specified Elasticsearch index.
+ generator: A generator object that can be used to iterate through the documents in the specified search engine index.
"""
- docs_generator = elasticsearch.helpers.scan(self.elastic,
+ docs_generator = self.helpers.scan(self.elastic,
query=query,
index=index,
size=es_gen_size,
@@ -89,20 +118,20 @@ def get_docs_generator(self, index: List, query: Dict, es_gen_size: int=800, req
def cogstack2df(self, query: Dict, index: str, column_headers=None, es_gen_size: int=800, request_timeout: int=300,
show_progress: bool = True):
"""
- Retrieve documents from an Elasticsearch index and convert them to a Pandas DataFrame.
-
+ Retrieve documents from an Elasticsearch or OpenSearch index and convert them to a Pandas DataFrame.
+
Args:
query (Dict): A dictionary containing the search query parameters.
- index (str): The name of the Elasticsearch index to search.
+ index (str): The name of the search engine index to search.
column_headers (List[str], optional): A list of column headers to use for the DataFrame. If not provided, the DataFrame will have default column names.
es_gen_size (int, optional): The number of documents to retrieve per batch. Defaults to 800.
- request_timeout (int, optional): The time in seconds to wait for a response from Elasticsearch before timing out. Defaults to 300.
+ request_timeout (int, optional): The time in seconds to wait for a response from the search engine before timing out. Defaults to 300.
show_progress (bool, optional): Whether to show the progress in console. Defaults to true.
Returns:
pandas.DataFrame: A DataFrame containing the retrieved documents.
"""
- docs_generator = elasticsearch.helpers.scan(self.elastic,
+ docs_generator = self.helpers.scan(self.elastic,
query=query,
index=index,
size=es_gen_size,
@@ -123,15 +152,15 @@ def cogstack2df(self, query: Dict, index: str, column_headers=None, es_gen_size:
else:
df = pd.DataFrame(temp_results)
return df
-
+
def DataFrame(self, index: str, columns: Optional[List[str]] = None):
"""
Fast method to return a pandas dataframe from a CogStack search.
-
+
Args:
index (str): A list of indices to search.
columns (List[str], optional): A list of column names to include in the DataFrame. If not provided, all columns will be included.
-
+
Returns:
DataFrame: A pd.DataFrame like object containing the retrieved documents.
"""
@@ -141,11 +170,11 @@ def DataFrame(self, index: str, columns: Optional[List[str]] = None):
def list_chunker(user_list: List[Any], n: int) -> List[List[Any]]:
"""
Divide a list into sublists of a specified size.
-
+
Args:
user_list (List[Any]): The list to be divided.
n (int): The size of the sublists.
-
+
Returns:
List[List[Any]]: A list of sublists containing the elements of the input list.
"""
diff --git a/requirements.txt b/requirements.txt
index a14b289..25e332f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,7 @@ eland==8.12.1
en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl
ipyfilechooser
jupyter_contrib_nbextensions
+
+# Search engine clients (install at least one)
+elasticsearch>=8.0.0 # For Elasticsearch support
+opensearch-py>=2.0.0 # For OpenSearch support (alternative to elasticsearch)
From cc97e07b78d75bb4c5def863aa0e37d7e9e07bf2 Mon Sep 17 00:00:00 2001
From: Tom Searle