Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.11
22 changes: 9 additions & 13 deletions chatdku/core/tools/syllabi_tool/local_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
Local PDF/DOCX Ingestor
A command line utility that extracts structured data from PDFs and DOCX files
using SGLang hosted Qwen3 model and stores results in PostgreSQL database.
using a large language model and stores the results in a PostgreSQL database.
"""

import argparse
Expand Down Expand Up @@ -60,7 +60,7 @@ def __init__(self, args):
self.args = args
self.setup_logging()
self.setup_database_connection()
self.setup_sglang_client()
self.setup_llm()
self.load_schema()
self.logger.info("Creating cursor.")
self.cursor = self.conn.cursor()
Expand Down Expand Up @@ -117,9 +117,7 @@ def setup_database_connection(self):
self.logger.error(f"Failed to connect to database: {e}")
sys.exit(1)

def setup_sglang_client(self):
"""Setup SGLang client for Qwen3 model"""
# SGLang serves models via OpenAI-compatible API
def setup_llm(self):
lm = dspy.LM(
model="openai/" + config.backup_llm,
api_base=config.backup_llm_url,
Expand All @@ -129,7 +127,7 @@ def setup_sglang_client(self):
temperature=config.llm_temperature,
)
dspy.configure(lm=lm)
self.logger.info(f"SGLang client configured for: {self.args.sglang_url}")
self.logger.info(f"LLM client configured for: {self.args.llm_url}")

def load_schema(self):
"""Load and validate JSON schema"""
Expand Down Expand Up @@ -236,7 +234,7 @@ def extract_docx_content(self, file_path: Path) -> str:
def extract_structured_data(
self, content: str, file_name: str
) -> Optional[Dict[str, Any]]:
"""Use SGLang + Qwen3 to extract structured data from content"""
"""Use LLM to extract structured data from content"""

# Create prompt for structured extraction based on schema
schema_description = json.dumps(self.schema, indent=2)
Expand Down Expand Up @@ -339,7 +337,6 @@ def process_file(self, file_path: Path):
self.logger.error(f"No content extracted from {file_path.name}")
return

# Extract structured data using SGLang + Qwen3
structured_data = self.extract_structured_data(content, file_path.name)

if structured_data:
Expand Down Expand Up @@ -429,7 +426,7 @@ def create_default_schema():
def main():
"""Main entry point"""
parser = argparse.ArgumentParser(
description="Extract structured data from PDFs and DOCX files using SGLang + Qwen3",
description="Extract structured data from PDFs and DOCX files.",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
Expand Down Expand Up @@ -478,17 +475,16 @@ def main():
help="Database table name for storing extracted data (default: documents)",
)

# SGLang configuration
parser.add_argument(
"--sglang-url",
"--llm-url",
default="http://localhost:8000/v1",
help="SGLang server URL (default: http://localhost:8000/v1)",
help="LLM server URL (default: http://localhost:8000/v1)",
)

parser.add_argument(
"--model-name",
default="Qwen/Qwen3-8B",
help="Model name for SGLang (default: Qwen/Qwen3-8B)",
help="Model name (default: Qwen/Qwen3-8B)",
)

# Utility arguments
Expand Down
Loading