Skip to content

Commit a7be34c

Browse files
added notebooks and some fixes
1 parent 2340d1b commit a7be34c

File tree

17 files changed

+3487
-2458
lines changed

main.ipynb

Lines changed: 0 additions & 116 deletions
This file was deleted.

notebooks/sql_generator.ipynb

Lines changed: 130 additions & 0 deletions
Large diffs are not rendered by default.

notebooks/upstream.ipynb

Lines changed: 1652 additions & 0 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,6 @@ test = [
4646
"pytest-asyncio>=1.1.0",
4747
]
4848
lint = ["ruff"]
49-
dev = [
50-
"pytest>=8.4.1",
51-
"pytest-cov>=6.2.1",
52-
]
5349

5450
[tool.ruff]
5551
src = ["src"]

src/data_tools/analysis/models.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
1+
import json
2+
import os
13
import uuid
24

3-
from typing import Any, Dict
5+
from typing import Any, Dict, Optional
46

7+
import yaml
8+
9+
from data_tools.common.exception import errors
10+
from data_tools.core import settings
511
from data_tools.dataframes.factory import DataFrameFactory
12+
from data_tools.dataframes.models import ColumnProfile
13+
from data_tools.models.resources.model import Column, ColumnProfilingMetrics
14+
from data_tools.models.resources.source import Source, SourceTables
615

716

817
class DataSet:
@@ -22,3 +31,49 @@ def __init__(self, df: Any, name: str):
2231

2332
# A dictionary to store the results of each analysis step
2433
self.results: Dict[str, Any] = {}
34+
35+
# FIXME - this is a temporary solution to save the results of the analysis
36+
# need to use model while executing the pipeline
37+
def save_yaml(self, file_path: Optional[str] = None) -> None:
38+
if file_path is None:
39+
file_path = f"{self.name}.yml"
40+
file_path = os.path.join(settings.PROJECT_BASE, file_path)
41+
42+
column_profiles = self.results.get("column_profiles")
43+
44+
table_description = self.results.get("table_glossary")
45+
table_tags = self.results.get("business_glossary_and_tags")
46+
47+
if column_profiles is None or table_description is None or table_tags is None:
48+
raise errors.NotFoundError(
49+
"Column profiles not found in the dataset results. Ensure profiling steps were executed."
50+
)
51+
52+
columns: list[Column] = []
53+
54+
for column_profile in column_profiles.values():
55+
column_profile = ColumnProfile.model_validate(column_profile)
56+
column = Column(
57+
name=column_profile.column_name,
58+
description=column_profile.business_glossary,
59+
type=column_profile.datatype_l1,
60+
category=column_profile.datatype_l2,
61+
tags=column_profile.business_tags,
62+
profiling_metrics=ColumnProfilingMetrics(
63+
count=column_profile.count,
64+
null_count=column_profile.null_count,
65+
distinct_count=column_profile.distinct_count,
66+
sample_data=column_profile.sample_data,
67+
),
68+
)
69+
columns.append(column)
70+
71+
table = SourceTables(name=self.name, description=table_description, columns=columns)
72+
73+
source = Source(name="healthcare", description=table_description, schema="public", database="", table=table)
74+
75+
sources = {"sources": [json.loads(source.model_dump_json())]}
76+
77+
# Save the YAML representation of the sources
78+
with open(file_path, "w") as file:
79+
yaml.dump(sources, file, sort_keys=False, default_flow_style=False)

src/data_tools/analysis/steps.py

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -38,20 +38,22 @@ def analyze(self, dataset: DataSet) -> None:
3838
Performs column-level profiling for each column.
3939
This step depends on the 'table_profile' result.
4040
"""
41-
41+
4242
# Dependency check
43-
if 'table_profile' not in dataset.results:
43+
if "table_profile" not in dataset.results:
4444
raise RuntimeError("TableProfiler must be run before ColumnProfiler.")
4545

46-
table_profile: ProfilingOutput = dataset.results['table_profile']
46+
table_profile: ProfilingOutput = dataset.results["table_profile"]
4747
all_column_profiles = {}
4848

4949
for col_name in table_profile.columns:
5050
# We would add a method to our DataFrame wrapper to get stats for a single column
51-
stats = dataset.dataframe_wrapper.column_profile(dataset.raw_df, dataset.name, col_name, settings.UPSTREAM_SAMPLE_LIMIT)
51+
stats = dataset.dataframe_wrapper.column_profile(
52+
dataset.raw_df, dataset.name, col_name, settings.UPSTREAM_SAMPLE_LIMIT
53+
)
5254
all_column_profiles[col_name] = stats
53-
54-
dataset.results['column_profiles'] = all_column_profiles
55+
56+
dataset.results["column_profiles"] = all_column_profiles
5557

5658

5759
class DataTypeIdentifierL1(AnalysisStep):
@@ -60,19 +62,21 @@ def analyze(self, dataset: DataSet) -> None:
6062
Performs datatype identification level 1 for each column.
6163
This step depends on the 'column_profiles' result.
6264
"""
63-
65+
6466
# Dependency check
65-
if 'column_profiles' not in dataset.results:
67+
if "column_profiles" not in dataset.results:
6668
raise RuntimeError("TableProfiler and ColumnProfiler must be run before DatatypeIdentifierL1.")
6769

68-
column_profiles: dict[str, ColumnProfile] = dataset.results['column_profiles']
70+
column_profiles: dict[str, ColumnProfile] = dataset.results["column_profiles"]
6971

70-
column_datatypes_l1 = dataset.dataframe_wrapper.datatype_identification_l1(dataset.raw_df, dataset.name, column_profiles)
72+
column_datatypes_l1 = dataset.dataframe_wrapper.datatype_identification_l1(
73+
dataset.raw_df, dataset.name, column_profiles
74+
)
7175

7276
for column in column_datatypes_l1:
7377
column_profiles[column.column_name].datatype_l1 = column.datatype_l1
7478

75-
dataset.results['column_datatypes_l1'] = column_datatypes_l1
79+
dataset.results["column_datatypes_l1"] = column_datatypes_l1
7680

7781

7882
class DataTypeIdentifierL2(AnalysisStep):
@@ -81,19 +85,21 @@ def analyze(self, dataset: DataSet) -> None:
8185
Performs datatype identification level 2 for each column.
8286
This step depends on the 'column_datatypes_l1' result.
8387
"""
84-
88+
8589
# Dependency check
86-
if 'column_profiles' not in dataset.results:
90+
if "column_profiles" not in dataset.results:
8791
raise RuntimeError("TableProfiler and ColumnProfiler must be run before DatatypeIdentifierL2.")
8892

89-
column_profiles: dict[str, ColumnProfile] = dataset.results['column_profiles']
93+
column_profiles: dict[str, ColumnProfile] = dataset.results["column_profiles"]
9094
columns_with_samples = [DataTypeIdentificationL2Input(**col.model_dump()) for col in column_profiles.values()]
91-
column_datatypes_l2 = dataset.dataframe_wrapper.datatype_identification_l2(dataset.raw_df, dataset.name, columns_with_samples)
95+
column_datatypes_l2 = dataset.dataframe_wrapper.datatype_identification_l2(
96+
dataset.raw_df, dataset.name, columns_with_samples
97+
)
9298

9399
for column in column_datatypes_l2:
94100
column_profiles[column.column_name].datatype_l2 = column.datatype_l2
95101

96-
dataset.results['column_datatypes_l2'] = column_datatypes_l2
102+
dataset.results["column_datatypes_l2"] = column_datatypes_l2
97103

98104

99105
class KeyIdentifier(AnalysisStep):
@@ -102,21 +108,22 @@ def analyze(self, dataset: DataSet) -> None:
102108
Performs key identification for the dataset.
103109
This step depends on the datatype identification results.
104110
"""
105-
if 'column_datatypes_l1' not in dataset.results or 'column_datatypes_l2' not in dataset.results:
111+
if "column_datatypes_l1" not in dataset.results or "column_datatypes_l2" not in dataset.results:
106112
raise RuntimeError("DataTypeIdentifierL1 and L2 must be run before KeyIdentifier.")
107-
108-
column_profiles: dict[str, ColumnProfile] = dataset.results['column_profiles']
113+
114+
column_profiles: dict[str, ColumnProfile] = dataset.results["column_profiles"]
109115
column_profiles_df = pd.DataFrame([col.model_dump() for col in column_profiles.values()])
110116

111117
key = dataset.dataframe_wrapper.key_identification(dataset.name, column_profiles_df)
112-
dataset.results["key"] = key
118+
if key is not None:
119+
dataset.results["key"] = key
113120

114121

115122
class BusinessGlossaryGenerator(AnalysisStep):
116123
    def __init__(self, domain: str):
        """
        Initializes the BusinessGlossaryGenerator for a specific industry domain.

        :param domain: The industry domain to which the dataset belongs; used
            as context when generating glossary terms and tags.
        """
        self.domain = domain
@@ -125,10 +132,10 @@ def analyze(self, dataset: DataSet) -> None:
125132
"""
126133
Generates business glossary terms and tags for each column in the dataset.
127134
"""
128-
if 'column_datatypes_l1' not in dataset.results:
135+
if "column_datatypes_l1" not in dataset.results:
129136
raise RuntimeError("DataTypeIdentifierL1 must be run before Business Glossary Generation.")
130-
131-
column_profiles: dict[str, ColumnProfile] = dataset.results['column_profiles']
137+
138+
column_profiles: dict[str, ColumnProfile] = dataset.results["column_profiles"]
132139
column_profiles_df = pd.DataFrame([col.model_dump() for col in column_profiles.values()])
133140

134141
glossary_output = dataset.dataframe_wrapper.generate_business_glossary(
@@ -138,7 +145,6 @@ def analyze(self, dataset: DataSet) -> None:
138145
for column in glossary_output.columns:
139146
column_profiles[column.column_name].business_glossary = column.business_glossary
140147
column_profiles[column.column_name].business_tags = column.business_tags
141-
142-
dataset.results["business_glossary_and_tags"] = glossary_output
143-
dataset.results['table_glossary'] = glossary_output.table_glossary
144148

149+
dataset.results["business_glossary_and_tags"] = glossary_output
150+
dataset.results["table_glossary"] = glossary_output.table_glossary

src/data_tools/core/settings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
class Settings(BaseSettings):
1414
"""Global Configuration"""
1515

16-
UPSTREAM_SAMPLE_LIMIT: int = 10000
16+
UPSTREAM_SAMPLE_LIMIT: int = 10
1717
MODEL_DIR_PATH: str = str(Path(os.path.split(os.path.abspath(__file__))[0]).parent.joinpath("artifacts"))
1818
MODEL_RESULTS_PATH: str = os.path.join("model", "model_results")
1919

@@ -24,7 +24,7 @@ class Settings(BaseSettings):
2424

2525
DI_MODEL_VERSION: str = "13052023"
2626

27-
PROJECT_BASE: str = "/home/juhel-phanju/Documents/backup/MIGRATION/codes/poc/dbt/ecom/ecom/models"
27+
PROJECT_BASE: str
2828

2929
MCP_SERVER_NAME: str = "data-tools"
3030
MCP_SERVER_DESCRIPTION: str = "Data Tools for MCP"

src/data_tools/dataframes/dataframe.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC, abstractmethod
2-
from typing import Any
2+
from typing import Any, Optional
33

44
import pandas as pd
55

@@ -20,7 +20,7 @@
2020
)
2121

2222

23-
class DataframeAdatper(ABC):
23+
class DataFrame(ABC):
2424
@abstractmethod
2525
def profile(self, df: Any) -> ProfilingOutput:
2626
pass
@@ -31,7 +31,7 @@ def column_profile(
3131
df: Any,
3232
table_name: str,
3333
column_name: str,
34-
sample_limit: int = 200,
34+
sample_limit: int = 10,
3535
) -> ColumnProfile:
3636
pass
3737

@@ -94,7 +94,7 @@ def key_identification(
9494
self,
9595
table_name: str,
9696
column_stats: pd.DataFrame,
97-
) -> KeyIdentificationOutput:
97+
) -> Optional[str]:
9898
"""
9999
Identifies potential primary keys in the DataFrame based on column profiles.
100100
@@ -104,12 +104,12 @@ def key_identification(
104104
`column_profile` method.
105105
106106
Returns:
107-
A KeyIdentificationOutput model containing the identified primary key column.
107+
A string (column name) containing the identified primary key column.
108108
"""
109109
ki_model = KeyIdentificationLLM(profiling_data=column_stats)
110110
ki_result = ki_model()
111111
output = KeyIdentificationOutput(**ki_result)
112-
return output
112+
return output.column_name
113113

114114
def generate_business_glossary(
115115
self,

0 commit comments

Comments
 (0)