Skip to content

Commit 6d03cba

Browse files
committed
All tests Updated
1 parent 7caf7b7 commit 6d03cba

19 files changed

Lines changed: 650 additions & 336 deletions

main.py

Lines changed: 181 additions & 109 deletions
Large diffs are not rendered by default.

src/formatter.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
- IOError on write → logs error and re-raises
1818
- IOError on validate → logs error and returns invalid result
1919
"""
20+
2021
from __future__ import annotations
2122

2223
import os
@@ -146,4 +147,4 @@ def validate_guide(self, file_path: str) -> ValidationResult:
146147
warnings=[f"Failed to read file: {exc}"],
147148
)
148149

149-
return OutputValidator.validate(content)
150+
return OutputValidator.validate(content)

src/model_engine.py

Lines changed: 139 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
• Content capping prevents context window overflow
1919
• Encoding fallback chain: UTF-8 → Latin-1 → error-replace (never crashes)
2020
"""
21+
2122
from __future__ import annotations
2223

2324
import os
@@ -86,68 +87,130 @@ class MimeClassifier:
8687
"""
8788

8889
# ── Image formats ─────────────────────────────────────────────────
89-
IMAGE_TYPES: frozenset = frozenset({
90-
"image/jpeg", "image/png", "image/gif", "image/bmp",
91-
"image/tiff", "image/webp", "image/svg+xml", "image/heic",
92-
"image/heif", "image/x-icon", "image/vnd.microsoft.icon",
93-
})
90+
IMAGE_TYPES: frozenset = frozenset(
91+
{
92+
"image/jpeg",
93+
"image/png",
94+
"image/gif",
95+
"image/bmp",
96+
"image/tiff",
97+
"image/webp",
98+
"image/svg+xml",
99+
"image/heic",
100+
"image/heif",
101+
"image/x-icon",
102+
"image/vnd.microsoft.icon",
103+
}
104+
)
94105

95106
# ── PDF ────────────────────────────────────────────────────────────
96107
PDF_TYPES: frozenset = frozenset({"application/pdf"})
97108

98109
# ── Office documents (ZIP archives with XML content) ──────────────
99-
OFFICE_TYPES: frozenset = frozenset({
100-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", # .docx
101-
"application/vnd.openxmlformats-officedocument.presentationml.presentation", # .pptx
102-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # .xlsx
103-
"application/vnd.oasis.opendocument.text", # .odt
104-
"application/vnd.oasis.opendocument.spreadsheet", # .ods
105-
"application/vnd.oasis.opendocument.presentation", # .odp
106-
"application/msword", # .doc
107-
"application/vnd.ms-excel", # .xls
108-
"application/vnd.ms-powerpoint", # .ppt
109-
})
110+
OFFICE_TYPES: frozenset = frozenset(
111+
{
112+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", # .docx
113+
"application/vnd.openxmlformats-officedocument.presentationml.presentation", # .pptx
114+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # .xlsx
115+
"application/vnd.oasis.opendocument.text", # .odt
116+
"application/vnd.oasis.opendocument.spreadsheet", # .ods
117+
"application/vnd.oasis.opendocument.presentation", # .odp
118+
"application/msword", # .doc
119+
"application/vnd.ms-excel", # .xls
120+
"application/vnd.ms-powerpoint", # .ppt
121+
}
122+
)
110123

111124
# ── Structured data ───────────────────────────────────────────────
112-
STRUCTURED_TYPES: frozenset = frozenset({
113-
"application/json", "text/csv", "text/xml", "application/xml",
114-
"text/yaml", "text/x-yaml", "application/x-yaml",
115-
"text/tab-separated-values",
116-
})
125+
STRUCTURED_TYPES: frozenset = frozenset(
126+
{
127+
"application/json",
128+
"text/csv",
129+
"text/xml",
130+
"application/xml",
131+
"text/yaml",
132+
"text/x-yaml",
133+
"application/x-yaml",
134+
"text/tab-separated-values",
135+
}
136+
)
117137

118138
# ── Text-readable (code, markup, config, etc.) ────────────────────
119-
TEXT_TYPES: frozenset = frozenset({
120-
"text/plain", "text/html", "text/css", "text/javascript",
121-
"text/x-python", "text/x-java", "text/x-c", "text/x-c++",
122-
"text/x-go", "text/x-rust", "text/x-ruby", "text/x-perl",
123-
"text/x-shellscript", "text/x-sh", "text/x-script.python",
124-
"text/markdown", "text/x-markdown", "text/x-rst",
125-
"text/x-tex", "text/x-latex",
126-
"text/x-diff", "text/x-patch",
127-
"text/x-log", "text/x-config",
128-
"application/javascript", "application/typescript",
129-
"application/x-httpd-php", "application/x-sh",
130-
"application/x-python-code",
131-
})
139+
TEXT_TYPES: frozenset = frozenset(
140+
{
141+
"text/plain",
142+
"text/html",
143+
"text/css",
144+
"text/javascript",
145+
"text/x-python",
146+
"text/x-java",
147+
"text/x-c",
148+
"text/x-c++",
149+
"text/x-go",
150+
"text/x-rust",
151+
"text/x-ruby",
152+
"text/x-perl",
153+
"text/x-shellscript",
154+
"text/x-sh",
155+
"text/x-script.python",
156+
"text/markdown",
157+
"text/x-markdown",
158+
"text/x-rst",
159+
"text/x-tex",
160+
"text/x-latex",
161+
"text/x-diff",
162+
"text/x-patch",
163+
"text/x-log",
164+
"text/x-config",
165+
"application/javascript",
166+
"application/typescript",
167+
"application/x-httpd-php",
168+
"application/x-sh",
169+
"application/x-python-code",
170+
}
171+
)
132172

133173
# ── Binary (not text-readable) ────────────────────────────────────
134-
BINARY_TYPES: frozenset = frozenset({
135-
"application/octet-stream", "application/zip", "application/gzip",
136-
"application/x-tar", "application/x-7z-compressed",
137-
"application/x-rar-compressed", "application/java-archive",
138-
"application/x-executable", "application/x-mach-binary",
139-
"application/x-sharedlib", "application/x-object",
140-
"application/wasm", "application/x-sqlite3",
141-
"audio/mpeg", "audio/wav", "audio/ogg", "audio/flac",
142-
"video/mp4", "video/x-matroska", "video/quicktime",
143-
"font/ttf", "font/otf", "font/woff", "font/woff2",
144-
})
174+
BINARY_TYPES: frozenset = frozenset(
175+
{
176+
"application/octet-stream",
177+
"application/zip",
178+
"application/gzip",
179+
"application/x-tar",
180+
"application/x-7z-compressed",
181+
"application/x-rar-compressed",
182+
"application/java-archive",
183+
"application/x-executable",
184+
"application/x-mach-binary",
185+
"application/x-sharedlib",
186+
"application/x-object",
187+
"application/wasm",
188+
"application/x-sqlite3",
189+
"audio/mpeg",
190+
"audio/wav",
191+
"audio/ogg",
192+
"audio/flac",
193+
"video/mp4",
194+
"video/x-matroska",
195+
"video/quicktime",
196+
"font/ttf",
197+
"font/otf",
198+
"font/woff",
199+
"font/woff2",
200+
}
201+
)
145202

146203
# ── Binary MIME prefixes for heuristic fallback ───────────────────
147204
_BINARY_PREFIXES: tuple = ("audio/", "video/", "font/")
148205
_BINARY_KEYWORDS: tuple = (
149-
"octet-stream", "executable", "archive",
150-
"compressed", "x-mach", "sqlite", "x-object", "x-sharedlib",
206+
"octet-stream",
207+
"executable",
208+
"archive",
209+
"compressed",
210+
"x-mach",
211+
"sqlite",
212+
"x-object",
213+
"x-sharedlib",
151214
)
152215

153216
@classmethod
@@ -462,7 +525,8 @@ def __init__(self, model_path: str = "google/gemma-3-4b-it") -> None:
462525

463526
log.info(
464527
"Engine operational — template: %d → %d chars",
465-
len(self.master_template), len(self._prompt_template),
528+
len(self.master_template),
529+
len(self._prompt_template),
466530
)
467531

468532
# ── Backward-compatible class methods (used by existing tests) ────
@@ -556,11 +620,11 @@ def _format_and_stream(
556620
Returns:
557621
Post-processed study guide markdown.
558622
"""
559-
prompt_text = self._build_system_prompt(
560-
raw_content=content, is_image=is_image
561-
)
623+
prompt_text = self._build_system_prompt(raw_content=content, is_image=is_image)
562624

563-
messages = [{"role": "user", "content": [{"type": "text", "text": prompt_text}]}]
625+
messages = [
626+
{"role": "user", "content": [{"type": "text", "text": prompt_text}]}
627+
]
564628
formatted_prompt = self.tokenizer.apply_chat_template(
565629
messages, tokenize=False, add_generation_prompt=True
566630
)
@@ -592,24 +656,32 @@ def process_resource(
592656
strategy = MimeClassifier.classify(resource.mime_type)
593657
log.info(
594658
"Processing %s → strategy=%s (mime=%s)",
595-
os.path.basename(resource.file_path), strategy, resource.mime_type,
659+
os.path.basename(resource.file_path),
660+
strategy,
661+
resource.mime_type,
596662
)
597663

598664
# Route to the correct analyzer
599665
dispatch = {
600-
"image": lambda: self._analyze_image(resource.file_path, on_token),
601-
"pdf": lambda: self._analyze_pdf(resource.file_path, on_token),
602-
"office": lambda: self._analyze_office(resource.file_path, on_token),
603-
"structured": lambda: self._analyze_structured(resource.file_path, resource.mime_type, on_token),
604-
"binary": lambda: self._analyze_binary(resource.file_path, on_token),
666+
"image": lambda: self._analyze_image(resource.file_path, on_token),
667+
"pdf": lambda: self._analyze_pdf(resource.file_path, on_token),
668+
"office": lambda: self._analyze_office(resource.file_path, on_token),
669+
"structured": lambda: self._analyze_structured(
670+
resource.file_path, resource.mime_type, on_token
671+
),
672+
"binary": lambda: self._analyze_binary(resource.file_path, on_token),
605673
}
606674

607-
analyzer = dispatch.get(strategy, lambda: self._analyze_text(resource.file_path, on_token))
675+
analyzer = dispatch.get(
676+
strategy, lambda: self._analyze_text(resource.file_path, on_token)
677+
)
608678
return analyzer()
609679

610680
# ── Private Analyzers ─────────────────────────────────────────────
611681

612-
def _analyze_image(self, image_path: str, on_token: Optional[Callable] = None) -> str:
682+
def _analyze_image(
683+
self, image_path: str, on_token: Optional[Callable] = None
684+
) -> str:
613685
"""
614686
Multimodal analysis for screenshots, diagrams, and photos.
615687
@@ -685,7 +757,9 @@ def _analyze_pdf(self, file_path: str, on_token: Optional[Callable] = None) -> s
685757
log.error("PDF analysis failed for %s: %s", file_path, exc)
686758
return f"S T A R R Y N O T E PDF Error: {exc}"
687759

688-
def _analyze_office(self, file_path: str, on_token: Optional[Callable] = None) -> str:
760+
def _analyze_office(
761+
self, file_path: str, on_token: Optional[Callable] = None
762+
) -> str:
689763
"""
690764
Office document analysis (.docx, .pptx, .xlsx, .odt).
691765
@@ -744,7 +818,9 @@ def _analyze_structured(
744818
log.error("Structured data analysis failed for %s: %s", file_path, exc)
745819
return f"S T A R R Y N O T E Structured Data Error: {exc}"
746820

747-
def _analyze_binary(self, file_path: str, on_token: Optional[Callable] = None) -> str:
821+
def _analyze_binary(
822+
self, file_path: str, on_token: Optional[Callable] = None
823+
) -> str:
748824
"""
749825
Binary file analysis via metadata summarization.
750826
@@ -788,4 +864,4 @@ def _analyze_text(self, file_path: str, on_token: Optional[Callable] = None) ->
788864
return self._format_and_stream(content=content, on_token=on_token)
789865
except Exception as exc:
790866
log.error("Text analysis failed for %s: %s", file_path, exc)
791-
return f"S T A R R Y N O T E Text Error: {exc}"
867+
return f"S T A R R Y N O T E Text Error: {exc}"

src/postprocessor.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
This avoids recompilation on every call — critical when
1919
processing batches of files.
2020
"""
21+
2122
from __future__ import annotations
2223

2324
import re
@@ -69,10 +70,16 @@ class MermaidFixer:
6970
_RE_TRAILING_SEMI = re.compile(r";(\s*)$", re.MULTILINE)
7071

7172
# Valid diagram type declarations that support classDef
72-
_VALID_TYPES = frozenset({
73-
"graph TD", "graph LR", "graph TB",
74-
"flowchart TD", "flowchart LR", "flowchart TB",
75-
})
73+
_VALID_TYPES = frozenset(
74+
{
75+
"graph TD",
76+
"graph LR",
77+
"graph TB",
78+
"flowchart TD",
79+
"flowchart LR",
80+
"flowchart TB",
81+
}
82+
)
7683

7784
@classmethod
7885
def fix(cls, text: str) -> str:
@@ -109,6 +116,7 @@ def _inject_classdef(cls, text: str) -> str:
109116
type line (graph TD, flowchart LR, etc.) if they are not
110117
already present in the block.
111118
"""
119+
112120
def _ensure_classdef(match: re.Match) -> str:
113121
block = match.group(0)
114122

@@ -137,6 +145,7 @@ def _remove_inline_styles(cls, text: str) -> str:
137145
The LLM sometimes generates `style NodeID fill:red` directives
138146
that conflict with the classDef-based styling system.
139147
"""
148+
140149
def _clean_block(match: re.Match) -> str:
141150
return cls._RE_INLINE_STYLE.sub("", match.group(0))
142151

@@ -150,6 +159,7 @@ def _remove_semicolons(cls, text: str) -> str:
150159
Mermaid.js v10+ does not use semicolons, but the LLM
151160
sometimes generates them from JavaScript/Java training data.
152161
"""
162+
153163
def _clean_block(match: re.Match) -> str:
154164
return cls._RE_TRAILING_SEMI.sub(r"\1", match.group(0))
155165

@@ -304,9 +314,7 @@ def validate(cls, text: str) -> ValidationResult:
304314
result.warnings.append("No Mermaid diagram found")
305315

306316
# ── Exam question check ───────────────────────────────────
307-
result.has_exam_questions = (
308-
"QUESTION 01" in text or "QUESTION 1" in text
309-
)
317+
result.has_exam_questions = "QUESTION 01" in text or "QUESTION 1" in text
310318
if not result.has_exam_questions:
311319
result.warnings.append("No exam questions found")
312320

0 commit comments

Comments
 (0)