Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions common/chunkers/character_chunker.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from common.chunkers.base_chunker import BaseChunker

_DEFAULT_FALLBACK_SIZE = 4096


class CharacterChunker(BaseChunker):
def __init__(self, chunk_size=1024, overlap_size=0):
if chunk_size <= overlap_size:
raise ValueError("Chunk size must be larger than overlap size")
self.chunk_size = chunk_size
def __init__(self, chunk_size=0, overlap_size=0):
self.chunk_size = chunk_size if chunk_size > 0 else _DEFAULT_FALLBACK_SIZE
self.overlap_size = overlap_size

def chunk(self, input_string):
if self.chunk_size <= 0:
return []
if self.chunk_size <= self.overlap_size:
raise ValueError("Chunk size must be larger than overlap size")

chunks = []
i = 0
Expand Down
34 changes: 31 additions & 3 deletions common/chunkers/html_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
from typing import Optional, List, Tuple
import re
from common.chunkers.base_chunker import BaseChunker
from common.chunkers.separators import TEXT_SEPARATORS
from langchain_text_splitters import HTMLSectionSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter


_DEFAULT_FALLBACK_SIZE = 4096


class HTMLChunker(BaseChunker):
Expand All @@ -25,12 +30,20 @@ class HTMLChunker(BaseChunker):
- Automatically detects which headers (h1-h6) are present in the HTML
- Uses only the headers that exist in the document for optimal chunking
- If custom headers are provided, uses those instead of auto-detection
- Supports chunk_size / chunk_overlap: when chunk_size > 0, oversized
header-based chunks are further split with RecursiveCharacterTextSplitter
- When chunk_size is 0 (default), a fallback of 4096 is used so that
headerless HTML documents are still split into reasonable chunks
"""

def __init__(
self,
headers: Optional[List[Tuple[str, str]]] = None # e.g. [("h1", "Header 1"), ("h2", "Header 2")]
chunk_size: int = 0,
chunk_overlap: int = 0,
headers: Optional[List[Tuple[str, str]]] = None,
):
self.chunk_size = chunk_size if chunk_size > 0 else _DEFAULT_FALLBACK_SIZE
self.chunk_overlap = chunk_overlap
self.headers = headers

def _detect_headers(self, html_content: str) -> List[Tuple[str, str]]:
Expand Down Expand Up @@ -77,8 +90,23 @@ def chunk(self, input_string: str) -> List[str]:
splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_use)
docs = splitter.split_text(input_string)

# Extract text content from Document objects
return [doc.page_content for doc in docs]
initial_chunks = [doc.page_content for doc in docs]

if any(len(chunk) > self.chunk_size for chunk in initial_chunks):
recursive_splitter = RecursiveCharacterTextSplitter(
separators=TEXT_SEPARATORS,
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
)
final_chunks = []
for chunk in initial_chunks:
if len(chunk) > self.chunk_size:
final_chunks.extend(recursive_splitter.split_text(chunk))
else:
final_chunks.append(chunk)
return final_chunks

return initial_chunks

def __call__(self, input_string: str) -> List[str]:
return self.chunk(input_string)
33 changes: 20 additions & 13 deletions common/chunkers/markdown_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
from langchain_text_splitters.markdown import ExperimentalMarkdownSyntaxTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# When chunk_size is not configured, cap any heading-section that exceeds this
# so that form-based PDFs (tables/bold but no # headings) are not left as a
# single multi-thousand-character chunk.
_DEFAULT_FALLBACK_SIZE = 4096


class MarkdownChunker(BaseChunker):

Expand All @@ -25,31 +30,33 @@ def __init__(
chunk_size: int = 0,
chunk_overlap: int = 0
):
self.chunk_size = chunk_size
self.chunk_size = chunk_size if chunk_size > 0 else _DEFAULT_FALLBACK_SIZE
self.chunk_overlap = chunk_overlap

def chunk(self, input_string):
md_splitter = ExperimentalMarkdownSyntaxTextSplitter()

# ExperimentalMarkdownSyntaxTextSplitter splits on # headings only.
# Documents without headings (e.g. form PDFs with tables/bold but no #)
# are returned as a single section, so a recursive fallback is always
# applied when any section exceeds the configured (or default) limit.
initial_chunks = [x.page_content for x in md_splitter.split_text(input_string)]
md_chunks = []

if self.chunk_size > 0:
if any(len(chunk) > self.chunk_size for chunk in initial_chunks):
recursive_splitter = RecursiveCharacterTextSplitter(
separators=TEXT_SEPARATORS,
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
)

if any(len(chunk) > self.chunk_size for chunk in initial_chunks):
for chunk in initial_chunks:
if len(chunk) > self.chunk_size:
# Split oversized chunks further
md_chunks.extend(recursive_splitter.split_text(chunk))
else:
md_chunks.append(chunk)

return md_chunks if md_chunks else initial_chunks
md_chunks = []
for chunk in initial_chunks:
if len(chunk) > self.chunk_size:
md_chunks.extend(recursive_splitter.split_text(chunk))
else:
md_chunks.append(chunk)
return md_chunks

return initial_chunks

def __call__(self, input_string):
return self.chunk(input_string)
6 changes: 4 additions & 2 deletions common/chunkers/recursive_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@
from common.chunkers.separators import TEXT_SEPARATORS
from langchain.text_splitter import RecursiveCharacterTextSplitter

_DEFAULT_FALLBACK_SIZE = 4096


class RecursiveChunker(BaseChunker):
def __init__(self, chunk_size=1024, overlap_size=0):
self.chunk_size = chunk_size
def __init__(self, chunk_size=0, overlap_size=0):
self.chunk_size = chunk_size if chunk_size > 0 else _DEFAULT_FALLBACK_SIZE
self.overlap_size = overlap_size

def chunk(self, input_string):
Expand Down
3 changes: 2 additions & 1 deletion common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,9 @@ def get_multimodal_service() -> LLM_Model:
gsPort=db_config.get("gsPort", "14240"),
restppPort=db_config.get("restppPort", "9000"),
graphname=db_config.get("graphname", ""),
apiToken=db_config.get("apiToken", ""),
)
if db_config.get("getToken"):
if not db_config.get("apiToken") and db_config.get("getToken"):
conn.getToken()

embedding_store = TigerGraphEmbeddingStore(
Expand Down
30 changes: 29 additions & 1 deletion common/db/connections.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,34 @@ def get_db_connection_pwd_manual(
return conn

def elevate_db_connection_to_token(host, username, password, graphname, async_conn: bool = False) -> TigerGraphConnectionProxy:
# If a pre-existing apiToken is provided in config, use it directly
# and skip the getToken() call to avoid conflicts.
static_token = db_config.get("apiToken", "")

if static_token:
LogWriter.info("Using pre-configured apiToken from db_config")
if async_conn:
conn = AsyncTigerGraphConnection(
host=host,
username=username,
password=password,
graphname=graphname,
apiToken=static_token,
restppPort=db_config.get("restppPort", "9000"),
gsPort=db_config.get("gsPort", "14240"),
)
else:
conn = TigerGraphConnection(
host=host,
username=username,
password=password,
graphname=graphname,
apiToken=static_token,
restppPort=db_config.get("restppPort", "9000"),
gsPort=db_config.get("gsPort", "14240"),
)
return conn

conn = TigerGraphConnection(
host=host,
username=username,
Expand All @@ -129,7 +157,7 @@ def elevate_db_connection_to_token(host, username, password, graphname, async_co
gsPort=db_config.get("gsPort", "14240")
)

if db_config["getToken"]:
if db_config.get("getToken"):
try:
apiToken = conn.getToken()[0]
except HTTPError:
Expand Down
10 changes: 8 additions & 2 deletions common/llm_services/base_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,19 @@ def route_response_prompt(self):
prompt = """\
You are an expert at routing a user question to a vectorstore, function calls, or conversation history.
Use the conversation history for questions that are similar to previous ones or that reference earlier answers or responses.
Use the vectorstore for questions on that would be best suited by text documents.
Use the vectorstore for questions that would be best suited by text documents.
Use the function calls for questions that ask about structured data, or operations on structured data.
Questions referring to same entities in a previous, earlier, or above answer or response should be routed to the conversation history.
Keep in mind that some questions about documents such as "how many documents are there?" can be answered by function calls.
The function calls can be used to answer questions about these entities: {v_types} and relationships: {e_types}.
IMPORTANT: Questions about graph database statistics or metadata MUST be routed to function calls. This includes:
- Counting vertices/nodes/edges (e.g. "how many vertices are there", "how many edges in the graph")
- Listing or describing vertex/edge types, schema, or graph structure
- Aggregations, totals, or summaries of data stored in the graph database
- Any question mentioning "graph", "graph db", "graph database", "vertices", "nodes", or "edges" in the context of statistics or counts
These are database queries, NOT document lookups — always route them to function calls.
Otherwise, use vectorstore. Choose one of 'functions', 'vectorstore', or 'history' based on the question and conversation history.
Return the a JSON with a single key 'datasource' and no premable or explaination.
Return a JSON with a single key 'datasource' and no preamble or explanation.
Question to route: {question}
Conversation history: {conversation}
Format: {format_instructions}\
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Use the vertex types, edge types, and their attributes and IDs below to write the pyTigerGraph function call to answer the question using a pyTigerGraph connection.
When the question asks for "How many", make sure to always select a function that contains "Count" in the description/function call. Make sure never to generate a function that is not listed below.
When the question asks for "How many", counts, totals, or statistics about vertices/nodes/edges in the graph or graph database, make sure to always select a function that contains "Count" in the description/function call. For example, questions like "how many vertices are there in the graph" or "how many vertices are there in the graph db" should use getVertexCount or getEdgeCount. Make sure never to generate a function that is not listed below.
When certain entities are mapped to vertex attributes, consider generating a WHERE clause.
If a WHERE clause is generated, follow the quoting instructions below when constructing the WHERE clause string: ensure that string attribute values are properly quoted.
For example, if the generated function contains "('Person', where='name=William Torres')", Expected Output: "('Person', where='name="William Torres"')", This rule applies to all types of attributes. e.g., name, email, address and so on.
Expand Down
2 changes: 1 addition & 1 deletion common/prompts/aws_bedrock_titan/generate_function.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Use the vertex types, edge types, and their attributes and IDs to write the pyTigerGraph function call to answer the question using a pyTigerGraph connection.
When the question asks for "How many", make sure to always select a function that contains "Count" in the description/function call. Make sure never to generate a function that is not listed below.
When the question asks for "How many", counts, totals, or statistics about vertices/nodes/edges in the graph or graph database, make sure to always select a function that contains "Count" in the description/function call. For example, questions like "how many vertices are there in the graph" or "how many vertices are there in the graph db" should use getVertexCount or getEdgeCount. Make sure never to generate a function that is not listed below.
When certain entities are mapped to vertex attributes, consider generating a WHERE clause.
If a WHERE clause is generated, follow the quoting instructions below when constructing the WHERE clause string: ensure that string attribute values are properly quoted.
For example, if the generated function contains "('Person', where='name=William Torres')", Expected Output: "('Person', where='name="William Torres"')", This rule applies to all types of attributes. e.g., name, email, address and so on.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Use the vertex types, edge types, and their attributes and IDs below to write the pyTigerGraph function call to answer the question using a pyTigerGraph connection.
When the question asks for "How many", make sure to always select a function that contains "Count" in the description/function call. Make sure never to generate a function that is not listed below.
When the question asks for "How many", counts, totals, or statistics about vertices/nodes/edges in the graph or graph database, make sure to always select a function that contains "Count" in the description/function call. For example, questions like "how many vertices are there in the graph" or "how many vertices are there in the graph db" should use getVertexCount or getEdgeCount. Make sure never to generate a function that is not listed below.
When certain entities are mapped to vertex attributes, consider generating a WHERE clause.
If a WHERE clause is generated, follow the quoting instructions below when constructing the WHERE clause string: ensure that string attribute values are properly quoted.
For example, if the generated function contains "('Person', where='name=William Torres')", Expected Output: "('Person', where='name="William Torres"')", This rule applies to all types of attributes. e.g., name, email, address and so on.
Expand Down
2 changes: 1 addition & 1 deletion common/prompts/custom/aml/generate_function.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Use the vertex types, edge types, and their attributes and IDs below to write the pyTigerGraph function call to answer the question using a pyTigerGraph connection.
When the question asks for "How many", make sure to always select a function that contains "Count" in the description/function call. Make sure never to generate a function that is not listed below.
When the question asks for "How many", counts, totals, or statistics about vertices/nodes/edges in the graph or graph database, make sure to always select a function that contains "Count" in the description/function call. For example, questions like "how many vertices are there in the graph" or "how many vertices are there in the graph db" should use getVertexCount or getEdgeCount. Make sure never to generate a function that is not listed below.
When certain entities are mapped to vertex attributes, consider generating a WHERE clause.
If a WHERE clause is generated, follow the quoting instructions below when constructing the WHERE clause string: ensure that string attribute values are properly quoted.
For example, if the generated function contains "('Person', where='name=William Torres')", Expected Output: "('Person', where='name="William Torres"')", This rule applies to all types of attributes. e.g., name, email, address and so on.
Expand Down
2 changes: 1 addition & 1 deletion common/prompts/gcp_vertexai_palm/generate_function.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Use the vertex types, edge types, and their attributes and IDs below to write the pyTigerGraph function call to answer the question using a pyTigerGraph connection.
When the question asks for "How many", make sure to always select a function that contains "Count" in the description/function call. Make sure never to generate a function that is not listed below.
When the question asks for "How many", counts, totals, or statistics about vertices/nodes/edges in the graph or graph database, make sure to always select a function that contains "Count" in the description/function call. For example, questions like "how many vertices are there in the graph" or "how many vertices are there in the graph db" should use getVertexCount or getEdgeCount. Make sure never to generate a function that is not listed below.
When certain entities are mapped to vertex attributes, consider generating a WHERE clause.
If a WHERE clause is generated, follow the quoting instructions below when constructing the WHERE clause string: ensure that string attribute values are properly quoted.
For example, if the generated function contains "('Person', where='name=William Torres')", Expected Output: "('Person', where='name="William Torres"')", This rule applies to all types of attributes. e.g., name, email, address and so on.
Expand Down
2 changes: 1 addition & 1 deletion common/prompts/google_gemini/generate_function.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Use the vertex types, edge types, and their attributes and IDs below to write the pyTigerGraph function call to answer the question using a pyTigerGraph connection.
When the question asks for "How many", make sure to always select a function that contains "Count" in the description/function call. Make sure never to generate a function that is not listed below.
When the question asks for "How many", counts, totals, or statistics about vertices/nodes/edges in the graph or graph database, make sure to always select a function that contains "Count" in the description/function call. For example, questions like "how many vertices are there in the graph" or "how many vertices are there in the graph db" should use getVertexCount or getEdgeCount. Make sure never to generate a function that is not listed below.
When certain entities are mapped to vertex attributes, consider generating a WHERE clause.
If a WHERE clause is generated, follow the quoting instructions below when constructing the WHERE clause string: ensure that string attribute values are properly quoted.
For example, if the generated function contains "('Person', where='name=William Torres')", Expected Output: "('Person', where='name="William Torres"')", This rule applies to all types of attributes. e.g., name, email, address and so on.
Expand Down
2 changes: 1 addition & 1 deletion common/prompts/llama_70b/generate_function.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Use the vertex types, edge types, and their attributes and IDs below to write the pyTigerGraph function call to answer the question using a pyTigerGraph connection.
When the question asks for "How many", make sure to always select a function that contains "Count" in the description/function call. Make sure never to generate a function that is not listed below.
When the question asks for "How many", counts, totals, or statistics about vertices/nodes/edges in the graph or graph database, make sure to always select a function that contains "Count" in the description/function call. For example, questions like "how many vertices are there in the graph" or "how many vertices are there in the graph db" should use getVertexCount or getEdgeCount. Make sure never to generate a function that is not listed below.
When certain entities are mapped to vertex attributes, consider generating a WHERE clause.
If a WHERE clause is generated, follow the quoting instructions below when constructing the WHERE clause string: ensure that string attribute values are properly quoted.
For example, if the generated function contains "('Person', where='name=William Torres')", Expected Output: "('Person', where='name="William Torres"')", This rule applies to all types of attributes. e.g., name, email, address and so on.
Expand Down
Loading
Loading