1818 • Content capping prevents context window overflow
1919 • Encoding fallback chain: UTF-8 → Latin-1 → error-replace (never crashes)
2020"""
21+
2122from __future__ import annotations
2223
2324import os
@@ -86,68 +87,130 @@ class MimeClassifier:
8687 """
8788
8889 # ── Image formats ─────────────────────────────────────────────────
89- IMAGE_TYPES : frozenset = frozenset ({
90- "image/jpeg" , "image/png" , "image/gif" , "image/bmp" ,
91- "image/tiff" , "image/webp" , "image/svg+xml" , "image/heic" ,
92- "image/heif" , "image/x-icon" , "image/vnd.microsoft.icon" ,
93- })
90+ IMAGE_TYPES : frozenset = frozenset (
91+ {
92+ "image/jpeg" ,
93+ "image/png" ,
94+ "image/gif" ,
95+ "image/bmp" ,
96+ "image/tiff" ,
97+ "image/webp" ,
98+ "image/svg+xml" ,
99+ "image/heic" ,
100+ "image/heif" ,
101+ "image/x-icon" ,
102+ "image/vnd.microsoft.icon" ,
103+ }
104+ )
94105
95106 # ── PDF ────────────────────────────────────────────────────────────
96107 PDF_TYPES : frozenset = frozenset ({"application/pdf" })
97108
98109 # ── Office documents (ZIP archives with XML content) ──────────────
99- OFFICE_TYPES : frozenset = frozenset ({
100- "application/vnd.openxmlformats-officedocument.wordprocessingml.document" , # .docx
101- "application/vnd.openxmlformats-officedocument.presentationml.presentation" , # .pptx
102- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" , # .xlsx
103- "application/vnd.oasis.opendocument.text" , # .odt
104- "application/vnd.oasis.opendocument.spreadsheet" , # .ods
105- "application/vnd.oasis.opendocument.presentation" , # .odp
106- "application/msword" , # .doc
107- "application/vnd.ms-excel" , # .xls
108- "application/vnd.ms-powerpoint" , # .ppt
109- })
110+ OFFICE_TYPES : frozenset = frozenset (
111+ {
112+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document" , # .docx
113+ "application/vnd.openxmlformats-officedocument.presentationml.presentation" , # .pptx
114+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" , # .xlsx
115+ "application/vnd.oasis.opendocument.text" , # .odt
116+ "application/vnd.oasis.opendocument.spreadsheet" , # .ods
117+ "application/vnd.oasis.opendocument.presentation" , # .odp
118+ "application/msword" , # .doc
119+ "application/vnd.ms-excel" , # .xls
120+ "application/vnd.ms-powerpoint" , # .ppt
121+ }
122+ )
110123
111124 # ── Structured data ───────────────────────────────────────────────
112- STRUCTURED_TYPES : frozenset = frozenset ({
113- "application/json" , "text/csv" , "text/xml" , "application/xml" ,
114- "text/yaml" , "text/x-yaml" , "application/x-yaml" ,
115- "text/tab-separated-values" ,
116- })
125+ STRUCTURED_TYPES : frozenset = frozenset (
126+ {
127+ "application/json" ,
128+ "text/csv" ,
129+ "text/xml" ,
130+ "application/xml" ,
131+ "text/yaml" ,
132+ "text/x-yaml" ,
133+ "application/x-yaml" ,
134+ "text/tab-separated-values" ,
135+ }
136+ )
117137
118138 # ── Text-readable (code, markup, config, etc.) ────────────────────
119- TEXT_TYPES : frozenset = frozenset ({
120- "text/plain" , "text/html" , "text/css" , "text/javascript" ,
121- "text/x-python" , "text/x-java" , "text/x-c" , "text/x-c++" ,
122- "text/x-go" , "text/x-rust" , "text/x-ruby" , "text/x-perl" ,
123- "text/x-shellscript" , "text/x-sh" , "text/x-script.python" ,
124- "text/markdown" , "text/x-markdown" , "text/x-rst" ,
125- "text/x-tex" , "text/x-latex" ,
126- "text/x-diff" , "text/x-patch" ,
127- "text/x-log" , "text/x-config" ,
128- "application/javascript" , "application/typescript" ,
129- "application/x-httpd-php" , "application/x-sh" ,
130- "application/x-python-code" ,
131- })
139+ TEXT_TYPES : frozenset = frozenset (
140+ {
141+ "text/plain" ,
142+ "text/html" ,
143+ "text/css" ,
144+ "text/javascript" ,
145+ "text/x-python" ,
146+ "text/x-java" ,
147+ "text/x-c" ,
148+ "text/x-c++" ,
149+ "text/x-go" ,
150+ "text/x-rust" ,
151+ "text/x-ruby" ,
152+ "text/x-perl" ,
153+ "text/x-shellscript" ,
154+ "text/x-sh" ,
155+ "text/x-script.python" ,
156+ "text/markdown" ,
157+ "text/x-markdown" ,
158+ "text/x-rst" ,
159+ "text/x-tex" ,
160+ "text/x-latex" ,
161+ "text/x-diff" ,
162+ "text/x-patch" ,
163+ "text/x-log" ,
164+ "text/x-config" ,
165+ "application/javascript" ,
166+ "application/typescript" ,
167+ "application/x-httpd-php" ,
168+ "application/x-sh" ,
169+ "application/x-python-code" ,
170+ }
171+ )
132172
133173 # ── Binary (not text-readable) ────────────────────────────────────
134- BINARY_TYPES : frozenset = frozenset ({
135- "application/octet-stream" , "application/zip" , "application/gzip" ,
136- "application/x-tar" , "application/x-7z-compressed" ,
137- "application/x-rar-compressed" , "application/java-archive" ,
138- "application/x-executable" , "application/x-mach-binary" ,
139- "application/x-sharedlib" , "application/x-object" ,
140- "application/wasm" , "application/x-sqlite3" ,
141- "audio/mpeg" , "audio/wav" , "audio/ogg" , "audio/flac" ,
142- "video/mp4" , "video/x-matroska" , "video/quicktime" ,
143- "font/ttf" , "font/otf" , "font/woff" , "font/woff2" ,
144- })
174+ BINARY_TYPES : frozenset = frozenset (
175+ {
176+ "application/octet-stream" ,
177+ "application/zip" ,
178+ "application/gzip" ,
179+ "application/x-tar" ,
180+ "application/x-7z-compressed" ,
181+ "application/x-rar-compressed" ,
182+ "application/java-archive" ,
183+ "application/x-executable" ,
184+ "application/x-mach-binary" ,
185+ "application/x-sharedlib" ,
186+ "application/x-object" ,
187+ "application/wasm" ,
188+ "application/x-sqlite3" ,
189+ "audio/mpeg" ,
190+ "audio/wav" ,
191+ "audio/ogg" ,
192+ "audio/flac" ,
193+ "video/mp4" ,
194+ "video/x-matroska" ,
195+ "video/quicktime" ,
196+ "font/ttf" ,
197+ "font/otf" ,
198+ "font/woff" ,
199+ "font/woff2" ,
200+ }
201+ )
145202
146203 # ── Binary MIME prefixes for heuristic fallback ───────────────────
147204 _BINARY_PREFIXES : tuple = ("audio/" , "video/" , "font/" )
148205 _BINARY_KEYWORDS : tuple = (
149- "octet-stream" , "executable" , "archive" ,
150- "compressed" , "x-mach" , "sqlite" , "x-object" , "x-sharedlib" ,
206+ "octet-stream" ,
207+ "executable" ,
208+ "archive" ,
209+ "compressed" ,
210+ "x-mach" ,
211+ "sqlite" ,
212+ "x-object" ,
213+ "x-sharedlib" ,
151214 )
152215
153216 @classmethod
@@ -462,7 +525,8 @@ def __init__(self, model_path: str = "google/gemma-3-4b-it") -> None:
462525
463526 log .info (
464527 "Engine operational — template: %d → %d chars" ,
465- len (self .master_template ), len (self ._prompt_template ),
528+ len (self .master_template ),
529+ len (self ._prompt_template ),
466530 )
467531
468532 # ── Backward-compatible class methods (used by existing tests) ────
@@ -556,11 +620,11 @@ def _format_and_stream(
556620 Returns:
557621 Post-processed study guide markdown.
558622 """
559- prompt_text = self ._build_system_prompt (
560- raw_content = content , is_image = is_image
561- )
623+ prompt_text = self ._build_system_prompt (raw_content = content , is_image = is_image )
562624
563- messages = [{"role" : "user" , "content" : [{"type" : "text" , "text" : prompt_text }]}]
625+ messages = [
626+ {"role" : "user" , "content" : [{"type" : "text" , "text" : prompt_text }]}
627+ ]
564628 formatted_prompt = self .tokenizer .apply_chat_template (
565629 messages , tokenize = False , add_generation_prompt = True
566630 )
@@ -592,24 +656,32 @@ def process_resource(
592656 strategy = MimeClassifier .classify (resource .mime_type )
593657 log .info (
594658 "Processing %s → strategy=%s (mime=%s)" ,
595- os .path .basename (resource .file_path ), strategy , resource .mime_type ,
659+ os .path .basename (resource .file_path ),
660+ strategy ,
661+ resource .mime_type ,
596662 )
597663
598664 # Route to the correct analyzer
599665 dispatch = {
600- "image" : lambda : self ._analyze_image (resource .file_path , on_token ),
601- "pdf" : lambda : self ._analyze_pdf (resource .file_path , on_token ),
602- "office" : lambda : self ._analyze_office (resource .file_path , on_token ),
603- "structured" : lambda : self ._analyze_structured (resource .file_path , resource .mime_type , on_token ),
604- "binary" : lambda : self ._analyze_binary (resource .file_path , on_token ),
666+ "image" : lambda : self ._analyze_image (resource .file_path , on_token ),
667+ "pdf" : lambda : self ._analyze_pdf (resource .file_path , on_token ),
668+ "office" : lambda : self ._analyze_office (resource .file_path , on_token ),
669+ "structured" : lambda : self ._analyze_structured (
670+ resource .file_path , resource .mime_type , on_token
671+ ),
672+ "binary" : lambda : self ._analyze_binary (resource .file_path , on_token ),
605673 }
606674
607- analyzer = dispatch .get (strategy , lambda : self ._analyze_text (resource .file_path , on_token ))
675+ analyzer = dispatch .get (
676+ strategy , lambda : self ._analyze_text (resource .file_path , on_token )
677+ )
608678 return analyzer ()
609679
610680 # ── Private Analyzers ─────────────────────────────────────────────
611681
612- def _analyze_image (self , image_path : str , on_token : Optional [Callable ] = None ) -> str :
682+ def _analyze_image (
683+ self , image_path : str , on_token : Optional [Callable ] = None
684+ ) -> str :
613685 """
614686 Multimodal analysis for screenshots, diagrams, and photos.
615687
@@ -685,7 +757,9 @@ def _analyze_pdf(self, file_path: str, on_token: Optional[Callable] = None) -> s
685757 log .error ("PDF analysis failed for %s: %s" , file_path , exc )
686758 return f"S T A R R Y N O T E PDF Error: { exc } "
687759
688- def _analyze_office (self , file_path : str , on_token : Optional [Callable ] = None ) -> str :
760+ def _analyze_office (
761+ self , file_path : str , on_token : Optional [Callable ] = None
762+ ) -> str :
689763 """
690764 Office document analysis (.docx, .pptx, .xlsx, .odt).
691765
@@ -744,7 +818,9 @@ def _analyze_structured(
744818 log .error ("Structured data analysis failed for %s: %s" , file_path , exc )
745819 return f"S T A R R Y N O T E Structured Data Error: { exc } "
746820
747- def _analyze_binary (self , file_path : str , on_token : Optional [Callable ] = None ) -> str :
821+ def _analyze_binary (
822+ self , file_path : str , on_token : Optional [Callable ] = None
823+ ) -> str :
748824 """
749825 Binary file analysis via metadata summarization.
750826
@@ -788,4 +864,4 @@ def _analyze_text(self, file_path: str, on_token: Optional[Callable] = None) ->
788864 return self ._format_and_stream (content = content , on_token = on_token )
789865 except Exception as exc :
790866 log .error ("Text analysis failed for %s: %s" , file_path , exc )
791- return f"S T A R R Y N O T E Text Error: { exc } "
867+ return f"S T A R R Y N O T E Text Error: { exc } "
0 commit comments