@@ -261,6 +261,11 @@ def _get_chunk_text(database_path: str, file_id: int, chunk_index: int) -> Optio
261261
262262 Security: Validates that the resolved file path is within the project directory
263263 to prevent path traversal attacks using os.path.commonpath for cross-platform safety.
264+
265+ Note: Uses errors='replace' when reading files to handle invalid UTF-8 sequences
266+ by replacing them with the Unicode replacement character (U+FFFD) rather than
267+ dropping them silently. This preserves the file structure and makes encoding
268+ issues visible.
264269 """
265270 conn = _connect_db (database_path )
266271 try :
@@ -281,25 +286,34 @@ def _get_chunk_text(database_path: str, file_id: int, chunk_index: int) -> Optio
281286 project_path = get_project_metadata (database_path , "project_path" )
282287 if not project_path :
283288 logger .error ("Project path not found in metadata, cannot read file from filesystem" )
284- raise RuntimeError ("Project path metadata is missing - indexing may not have completed properly" )
289+ raise RuntimeError ("Project path metadata is missing - ensure the indexing process has stored project metadata properly" )
285290
286291 # Construct full file path and resolve to absolute path
287292 full_path = os .path .abspath (os .path .join (project_path , file_path ))
288293 normalized_project_path = os .path .abspath (project_path )
289294
290295 # Security check: ensure the resolved path is within the project directory
291- # Using os.path.commonpath for cross-platform safety (handles Windows/Unix path separators)
296+ # Using os.path.commonpath to find the longest common path.
297+ # If the common path equals the project path, and the full path starts with
298+ # the project path (or is the project path itself), then it's safe.
292299 try :
293300 common = os .path .commonpath ([full_path , normalized_project_path ])
301+ # Verify the common path is the project path (file is within project)
294302 if common != normalized_project_path :
295303 logger .error (f"Path traversal attempt detected: { file_path } resolves outside project directory" )
296304 return None
305+ # Additional check: full_path must either be the project path itself or begin
306+ # with the project path followed by a path separator (an exact match is allowed)
307+ if full_path != normalized_project_path and not full_path .startswith (normalized_project_path + os .sep ):
308+ logger .error (f"Path traversal attempt detected: { file_path } does not start with project directory" )
309+ return None
297310 except ValueError :
298- # Different drives on Windows
299- logger .error (f"Path traversal attempt detected: { file_path } is on a different drive" )
311+ # Different drives on Windows or incompatible paths
312+ logger .error (f"Path traversal attempt detected: { file_path } is on a different drive or incompatible path " )
300313 return None
301314
302315 # Read file content from filesystem
316+ # Using errors='replace' to preserve file structure and make encoding issues visible
303317 try :
304318 with open (full_path , "r" , encoding = "utf-8" , errors = "replace" ) as fh :
305319 content = fh .read ()
0 commit comments