Skip to content

Commit 5fbd520

Browse files
fix(deps): resolve CVEs across pip and npm dependencies with langchain 1.x migration
- pyproject.toml: tighten minimum version constraints for all vulnerable direct/transitive pip deps; add [tool.uv] override-dependencies to force requests>=2.33.0 past pysonar pin; relax streamlit from ==1.50.0 to >=1.51.0 - uv.lock: regenerated — notable bumps: langchain-core 0.3.80→1.2.23, langsmith 0.4.49→0.7.22, tornado 6.5.2→6.5.5, pyopenssl 25.3.0→26.0.0, streamlit 1.50.0→1.55.0, requests 2.32.5→2.33.0 - streamlit_app/requirements.txt: relax streamlit pin to >=1.51.0 - docsite/package.json: add npm overrides for node-forge ^1.4.0, serialize-javascript ^7.0.5, brace-expansion, path-to-regexp, picomatch - docsite/package-lock.json: regenerated — node-forge 1.3.2→1.4.0, serialize-javascript 7.0.4→7.0.5; zero npm vulnerabilities remaining - migrate 6 source files from removed langchain.output_parsers/langchain.schema to langchain_classic (langchain 1.x breaking change)
1 parent 0437589 commit 5fbd520

11 files changed

Lines changed: 596 additions & 510 deletions

File tree

docsite/package-lock.json

Lines changed: 25 additions & 34 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docsite/package.json

Lines changed: 5 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -31,7 +31,11 @@
3131
},
3232
"overrides": {
3333
"svgo": "^3.3.3",
34-
"serialize-javascript": "^7.0.3",
34+
"serialize-javascript": "^7.0.5",
35+
"node-forge": "^1.4.0",
36+
"brace-expansion": "^1.1.13",
37+
"path-to-regexp": "^0.1.13",
38+
"picomatch": "^4.0.0",
3539
"minimatch": "^3.1.4",
3640
"ajv": "^8.18.0",
3741
"qs": "^6.14.2",

pyproject.toml

Lines changed: 22 additions & 7 deletions
Original file line number · Diff line number · Diff line change
@@ -27,8 +27,8 @@ classifiers = [
2727
dependencies = [
2828
"langchain-community>=0.4.0",
2929
"langchain-openai>=1.0.0",
30-
"langgraph>=1.1.1,<2.0.0",
31-
"nltk>=3.9.3",
30+
"langgraph>=1.1.3,<2.0.0",
31+
"nltk>=3.9.4",
3232
"numpy<=2.3.0",
3333
"asyncpg>=0.30.0",
3434
"fastapi[standard]>=0.116.1",
@@ -54,10 +54,19 @@ dependencies = [
5454
"aiofiles>=23.2.1",
5555
"tavily-python>=0.1.11",
5656
"pillow>=12.1.1",
57-
"cryptography>=46.0.5",
58-
"filelock>=3.20.3",
59-
"PyJWT>=2.12.0",
60-
"orjson>=3.11.6",
57+
"cryptography>=46.0.6",
58+
"filelock>=3.25.0",
59+
"PyJWT>=2.12.1",
60+
"orjson>=3.11.7",
61+
"tornado>=6.5.3",
62+
"protobuf>=6.33.0",
63+
"pyasn1>=0.6.3",
64+
"marshmallow>=3.26.2",
65+
"pygments>=2.20.0",
66+
"pyopenssl>=26.0.0",
67+
"langsmith>=0.4.50",
68+
"fonttools>=4.62.1",
69+
"requests>=2.33.0",
6170
]
6271

6372
[project.optional-dependencies]
@@ -99,7 +108,7 @@ oracle = [
99108
]
100109

101110
streamlit = [
102-
"streamlit==1.50.0",
111+
"streamlit>=1.51.0",
103112
"pyngrok==7.4.0",
104113
"python-dotenv==1.1.1",
105114
"xlsxwriter==3.2.9",
@@ -144,6 +153,12 @@ dev = [
144153
"twine>=6.1.0",
145154
]
146155

156+
[tool.uv]
157+
override-dependencies = [
158+
# pysonar (dev-only) pins requests==2.32.5 which conflicts; override to fix CVE
159+
"requests>=2.33.0",
160+
]
161+
147162
[tool.ruff]
148163
src = ["src"]
149164

src/intugle/core/conceptual_search/agent/tools/web_tools.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -1,6 +1,6 @@
11
import logging
22

3-
from langchain.schema import Document
3+
from langchain_classic.schema import Document
44
from langchain_community.tools.tavily_search import TavilySearchResults
55
from langchain_core.tools import tool
66

src/intugle/core/llms/chat.py

Lines changed: 5 additions & 11 deletions
Original file line number · Diff line number · Diff line change
@@ -3,12 +3,12 @@
33
from typing import TYPE_CHECKING, Optional
44

55
from langchain.chat_models import init_chat_model
6-
from langchain.output_parsers import (
6+
from langchain_classic.output_parsers import (
77
ResponseSchema,
88
RetryWithErrorOutputParser,
99
StructuredOutputParser,
1010
)
11-
from langchain.prompts import BaseChatPromptTemplate, ChatPromptTemplate
11+
from langchain_core.prompts import BaseChatPromptTemplate, ChatPromptTemplate
1212
from langchain_core.rate_limiters import InMemoryRateLimiter
1313

1414
from intugle.core import settings
@@ -53,9 +53,7 @@ def __init__(
5353
self.prompt_template: BaseChatPromptTemplate = prompt_template # prompt template
5454

5555
self.output_parser = (
56-
self.__output_parser_builder__(response_schemas=response_schemas)
57-
if response_schemas is not None
58-
else None
56+
self.__output_parser_builder__(response_schemas=response_schemas) if response_schemas is not None else None
5957
) # the built output parser
6058

6159
self.format_instructions = (
@@ -74,9 +72,7 @@ def __output_parser_builder__(self, response_schemas: list[ResponseSchema] = Non
7472
for building the corresponding output paraser from the given ResponseSchema
7573
"""
7674
parser = self.parser.from_response_schemas(response_schemas=response_schemas)
77-
retry_parser = RetryWithErrorOutputParser.from_llm(
78-
parser=parser, llm=self.model, max_retries=self.MAX_RETRIES
79-
)
75+
retry_parser = RetryWithErrorOutputParser.from_llm(parser=parser, llm=self.model, max_retries=self.MAX_RETRIES)
8076
return retry_parser
8177

8278
@classmethod
@@ -97,9 +93,7 @@ def invoke(self, *args, **kwargs):
9793

9894
sucessfull_parsing = False
9995

100-
prompt_value = self.llm_prompt.format_prompt(
101-
format_instructions=self.format_instructions, **kwargs
102-
)
96+
prompt_value = self.llm_prompt.format_prompt(format_instructions=self.format_instructions, **kwargs)
10397
messages = prompt_value.to_messages()
10498
_message = messages
10599
response = ""

src/intugle/core/pipeline/business_glossary/prompts.py

Lines changed: 21 additions & 16 deletions
Original file line number · Diff line number · Diff line change
@@ -1,12 +1,18 @@
1-
from langchain.output_parsers import ResponseSchema
1+
from langchain_classic.output_parsers import ResponseSchema
22

33
table_glossary = [ResponseSchema(name="table_glossary", description=" single-sentence business glossary definition")]
4-
column_glossary = [ResponseSchema(name="column_glossary", description="precise, single-sentence and non-technical business glossary definition")]
5-
column_tag_glossary = [ResponseSchema(name="column_tag_glossary", description="three precise and distinct business tags", type="list[str]")]
4+
column_glossary = [
5+
ResponseSchema(
6+
name="column_glossary", description="precise, single-sentence and non-technical business glossary definition"
7+
)
8+
]
9+
column_tag_glossary = [
10+
ResponseSchema(name="column_tag_glossary", description="three precise and distinct business tags", type="list[str]")
11+
]
612

713
BUSINESS_GLOSSARY_PROMPTS = {
8-
"gpt-4o": {
9-
"TABLE_GLOSSARY_TEMPLATE": """You are responsible for Data Governance in {domain},
14+
"gpt-4o": {
15+
"TABLE_GLOSSARY_TEMPLATE": """You are responsible for Data Governance in {domain},
1016
generate a concise, non-technical business glossary definition for the table on a provided DDL statement.
1117
The definition should be written as a single sentence and clearly describe the business purpose or function.\n
1218
# Instructions
@@ -23,7 +29,7 @@
2329
# Output
2430
{format_instructions}
2531
""",
26-
"BUSINESS_GLOSSARY_TEMPLATE": """You are responsible for Data Governance in {domain},
32+
"BUSINESS_GLOSSARY_TEMPLATE": """You are responsible for Data Governance in {domain},
2733
generate a concise single-sentence business glossary definition for each column mentioned in the DDL statement.\n
2834
The definition should clearly describe the business purpose or function.\n
2935
@@ -38,7 +44,7 @@
3844
{create_statements}\n
3945
{format_instructions}
4046
""",
41-
"BUSINESS_TAGS_TEMPLATE": """You are responsible for Data Governance in {domain}, your task is to generate three business tags for a column based on the DDL statements of a table given below.
47+
"BUSINESS_TAGS_TEMPLATE": """You are responsible for Data Governance in {domain}, your task is to generate three business tags for a column based on the DDL statements of a table given below.
4248
Use the column's context within the DDL statement (e.g., its name, type, and table name) to infer relevant business tags. Focus on generating concise, domain-relevant,
4349
and meaningful tags that align with the potential business use of the column.
4450
@@ -65,10 +71,10 @@
6571
# Additional Context:
6672
{additional_context}\n
6773
{format_instructions}
68-
"""
69-
},
70-
"gpt-4o-mini": {
71-
"TABLE_GLOSSARY_TEMPLATE": """
74+
""",
75+
},
76+
"gpt-4o-mini": {
77+
"TABLE_GLOSSARY_TEMPLATE": """
7278
Role: You are responsible for Data Governance in the {domain}.\n
7379
Task: You will be given a SQL DDL statement how `{table}` table is structured. Generate a concise, non-technical business glossary definition for `{table}` that clearly describe the business purpose or function.\n
7480
@@ -89,7 +95,7 @@
8995
\n\n
9096
{format_instructions}
9197
""",
92-
"BUSINESS_GLOSSARY_TEMPLATE": """
98+
"BUSINESS_GLOSSARY_TEMPLATE": """
9399
Role: You are responsible for Data Governance in the {domain}.\n
94100
Task: You will be given a SQL DDL statement how the attribute `{column}` is structured.\n
95101
@@ -111,8 +117,7 @@
111117
{additional_context}\n\n
112118
{format_instructions}
113119
""",
114-
"BUSINESS_TAGS_TEMPLATE":
115-
"""
120+
"BUSINESS_TAGS_TEMPLATE": """
116121
Role: You are responsible for Data Governance in the {domain}.\n
117122
Task: You will be given a SQL DDL statement how the attribute `{column}` is structured.\n
118123
@@ -131,6 +136,6 @@
131136
# Additional Context:
132137
{additional_context}\n\n
133138
{format_instructions}
134-
"""
139+
""",
140+
},
135141
}
136-
}

src/intugle/core/pipeline/business_glossary/utils.py

Lines changed: 2 additions & 5 deletions
Original file line number · Diff line number · Diff line change
@@ -3,7 +3,7 @@
33

44
import pandas as pd
55

6-
from langchain.output_parsers import RetryOutputParser
6+
from langchain_classic.output_parsers import RetryOutputParser
77
from langchain_core.prompt_values import StringPromptValue
88

99
from intugle.core import settings
@@ -56,11 +56,8 @@ def get_additional_context(table_name: str, global_additional_context: str = "",
5656

5757

5858
def preprocess_profiling_df(profiling_data: pd.DataFrame):
59-
6059
profiling_data = preprocess_profiling_data(
61-
profiling_data=profiling_data,
62-
sample_limit=settings.STRATA_SAMPLE_LIMIT,
63-
dtypes_to_filter=None
60+
profiling_data=profiling_data, sample_limit=settings.STRATA_SAMPLE_LIMIT, dtypes_to_filter=None
6461
)
6562

6663
return profiling_data

src/intugle/core/pipeline/datatype_identification/l2_model.py

Lines changed: 2 additions & 5 deletions
Original file line number · Diff line number · Diff line change
@@ -8,7 +8,7 @@
88

99
import pandas as pd
1010

11-
from langchain.output_parsers import ResponseSchema
11+
from langchain_classic.output_parsers import ResponseSchema
1212
from tqdm.auto import tqdm
1313

1414
from intugle.core.llms.chat import ChatModelLLM
@@ -82,9 +82,7 @@ def __classify_dim_measure__(self, table: str, column_name: str) -> str:
8282
def __call__(self, row) -> str:
8383
column_name = row["column_name"]
8484

85-
sample_data = adjust_sample(
86-
sample_data=row["sample_data"], expected_size=settings.L2_SAMPLE_LIMIT
87-
)
85+
sample_data = adjust_sample(sample_data=row["sample_data"], expected_size=settings.L2_SAMPLE_LIMIT)
8886

8987
table = pd.DataFrame(sample_data, columns=[column_name])
9088

@@ -101,7 +99,6 @@ def __call__(
10199
self,
102100
l1_pred: pd.DataFrame,
103101
):
104-
105102
l1_pred["predicted_datatype_l2"] = l1_pred.progress_apply(
106103
self.__model,
107104
axis=1,

0 commit comments

Comments (0)