Skip to content

Commit 7102510

Browse files
Copilot and Mte90 committed
Improve documentation and strengthen path validation logic
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent 80c6bd0 commit 7102510

1 file changed

Lines changed: 18 additions & 4 deletions

File tree

ai/analyzer.py

Lines changed: 18 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -261,6 +261,11 @@ def _get_chunk_text(database_path: str, file_id: int, chunk_index: int) -> Optio
261261
262262
Security: Validates that the resolved file path is within the project directory
263263
to prevent path traversal attacks using os.path.commonpath for cross-platform safety.
264+
265+
Note: Uses errors='replace' when reading files to handle invalid UTF-8 sequences
266+
by replacing them with the Unicode replacement character (U+FFFD) rather than
267+
dropping them silently. This preserves the file structure and makes encoding
268+
issues visible.
264269
"""
265270
conn = _connect_db(database_path)
266271
try:
@@ -281,25 +286,34 @@ def _get_chunk_text(database_path: str, file_id: int, chunk_index: int) -> Optio
281286
project_path = get_project_metadata(database_path, "project_path")
282287
if not project_path:
283288
logger.error("Project path not found in metadata, cannot read file from filesystem")
284-
raise RuntimeError("Project path metadata is missing - indexing may not have completed properly")
289+
raise RuntimeError("Project path metadata is missing - ensure the indexing process has stored project metadata properly")
285290

286291
# Construct full file path and resolve to absolute path
287292
full_path = os.path.abspath(os.path.join(project_path, file_path))
288293
normalized_project_path = os.path.abspath(project_path)
289294

290295
# Security check: ensure the resolved path is within the project directory
291-
# Using os.path.commonpath for cross-platform safety (handles Windows/Unix path separators)
296+
# Using os.path.commonpath to find the longest common path.
297+
# If the common path equals the project path, and the full path starts with
298+
# the project path (or is the project path itself), then it's safe.
292299
try:
293300
common = os.path.commonpath([full_path, normalized_project_path])
301+
# Verify the common path is the project path (file is within project)
294302
if common != normalized_project_path:
295303
logger.error(f"Path traversal attempt detected: {file_path} resolves outside project directory")
296304
return None
305+
# Additional check: ensure full_path actually starts with project path
306+
# (covers edge cases where paths might be equal)
307+
if full_path != normalized_project_path and not full_path.startswith(normalized_project_path + os.sep):
308+
logger.error(f"Path traversal attempt detected: {file_path} does not start with project directory")
309+
return None
297310
except ValueError:
298-
# Different drives on Windows
299-
logger.error(f"Path traversal attempt detected: {file_path} is on a different drive")
311+
# Different drives on Windows or incompatible paths
312+
logger.error(f"Path traversal attempt detected: {file_path} is on a different drive or incompatible path")
300313
return None
301314

302315
# Read file content from filesystem
316+
# Using errors='replace' to preserve file structure and make encoding issues visible
303317
try:
304318
with open(full_path, "r", encoding="utf-8", errors="replace") as fh:
305319
content = fh.read()

0 commit comments

Comments
 (0)