-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdiff_processor.py
More file actions
609 lines (499 loc) · 21 KB
/
diff_processor.py
File metadata and controls
609 lines (499 loc) · 21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
"""
Diff Processing Utilities for Code Review.
Handles parsing, filtering, and prioritization of PR diffs.
Applies same rules as MCP server LargeContentFilter (25KB file limit).
"""
import os
import re
import logging
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, field
from enum import Enum
from utils.signature_patterns import DIFF_SIGNATURE_PATTERNS
logger = logging.getLogger(__name__)
# Constants from environment or defaults (matching MCP server LargeContentFilter)
# See: org.rostilos.codecrow.mcp.filter.LargeContentFilter
DEFAULT_FILE_SIZE_THRESHOLD_BYTES = 25 * 1024 # 25KB - same as LargeContentFilter.DEFAULT_SIZE_THRESHOLD_BYTES
MAX_FILE_SIZE_BYTES = int(os.environ.get("DIFF_MAX_FILE_SIZE", str(DEFAULT_FILE_SIZE_THRESHOLD_BYTES)))
MAX_FILES_IN_DIFF = int(os.environ.get("DIFF_MAX_FILES", "400")) # Maximum files to process
MAX_DIFF_SIZE_BYTES = int(os.environ.get("DIFF_MAX_TOTAL_SIZE", "1000000")) # 1MB total diff size
MAX_LINES_PER_FILE = int(os.environ.get("DIFF_MAX_LINES_PER_FILE", "3000")) # Maximum lines per file
# Placeholder message matching LargeContentFilter.FILTERED_PLACEHOLDER
FILTERED_PLACEHOLDER = "[CodeCrow Filter: file too large (>25KB), omitted from analysis]"
FILTERED_DIFF_TEMPLATE = """diff --git a/{path} b/{path}
--- a/{path}
+++ b/{path}
[CodeCrow Filter: file diff too large (>{threshold_kb}KB), omitted from analysis. File type: {diff_type}]
"""
# ── Patterns for extracting function/class signatures from diffs ──────
# Imported from utils.signature_patterns.DIFF_SIGNATURE_PATTERNS
def summarize_oversized_diff(
    diff_content: str,
    path: str,
    max_sigs: int = 30,
    patterns: Optional[List["re.Pattern"]] = None,
) -> str:
    """
    Generate a compact summary for an oversized diff instead of omitting entirely.

    Extracts:
    - Total additions / deletions
    - Added/removed/modified function and class signatures
    - Hunk headers (@@ ... @@) which contain surrounding function names

    This gives the LLM enough signal to reason about the change without
    blowing through the token budget.

    Args:
        diff_content: Unified diff text for a single file.
        path: Repository path of the file (used in the synthetic diff header).
        max_sigs: Cap on how many signatures of each kind are reported.
        patterns: Optional override for the signature regexes; defaults to
            DIFF_SIGNATURE_PATTERNS. Each pattern must expose the signature
            name in group(1).

    Returns:
        A synthetic diff-shaped string: header lines followed by statistics,
        signature lists, and affected hunk headers.
    """
    if patterns is None:
        patterns = DIFF_SIGNATURE_PATTERNS

    added_lines = 0
    removed_lines = 0
    added_sigs = []
    removed_sigs = []
    hunk_headers = []
    for line in diff_content.split('\n'):
        if line.startswith('+') and not line.startswith('+++'):
            added_lines += 1
        elif line.startswith('-') and not line.startswith('---'):
            removed_lines += 1
        # Capture hunk headers — they often contain the enclosing function name
        elif line.startswith('@@'):
            hunk_headers.append(line.strip())

    # Extract function/class signatures from added and removed lines
    for pattern in patterns:
        for match in pattern.finditer(diff_content):
            # Recover the full text of the line the match sits on so the
            # diff marker in column 0 can be inspected.
            line_start = diff_content.rfind('\n', 0, match.start()) + 1
            full_match_line = diff_content[line_start:match.end()]
            sig = match.group(1).strip()
            # BUGFIX: inspect column 0 directly instead of lstrip().startswith().
            # Unified-diff context lines are prefixed with a space; stripping
            # leading whitespace misclassified context code whose content
            # happens to begin with '+' or '-'. Also exclude the '+++'/'---'
            # file-header lines, consistent with the line counters above.
            if full_match_line.startswith('+') and not full_match_line.startswith('+++'):
                added_sigs.append(sig)
            elif full_match_line.startswith('-') and not full_match_line.startswith('---'):
                removed_sigs.append(sig)

    # Deduplicate while preserving order (dict.fromkeys keeps insertion order)
    added_sigs = list(dict.fromkeys(added_sigs))[:max_sigs]
    removed_sigs = list(dict.fromkeys(removed_sigs))[:max_sigs]
    hunk_headers = list(dict.fromkeys(hunk_headers))[:20]

    parts = [
        f"diff --git a/{path} b/{path}",
        f"--- a/{path}",
        f"+++ b/{path}",
        f"[CodeCrow Summary: diff too large for full inclusion — summary below]",
        f"",
        f"Change statistics: +{added_lines} lines added, -{removed_lines} lines removed",
    ]
    if added_sigs:
        parts.append(f"\nAdded/modified signatures ({len(added_sigs)}):")
        for sig in added_sigs:
            parts.append(f" + {sig}")
    if removed_sigs:
        parts.append(f"\nRemoved/modified signatures ({len(removed_sigs)}):")
        for sig in removed_sigs:
            parts.append(f" - {sig}")
    if hunk_headers:
        parts.append(f"\nAffected code regions ({len(hunk_headers)} hunks):")
        for hh in hunk_headers:
            parts.append(f" {hh}")
    if not added_sigs and not removed_sigs and not hunk_headers:
        parts.append("\n(No recognizable function/class signatures found in diff)")
    return "\n".join(parts)
class DiffChangeType(Enum):
    """Kind of change a unified diff records for a single file."""

    ADDED = "added"        # file is new in this diff
    MODIFIED = "modified"  # existing file with content edits (default)
    DELETED = "deleted"    # file removed by this diff
    RENAMED = "renamed"    # file moved/renamed
    BINARY = "binary"      # binary file; no textual hunks available
@dataclass
class DiffFile:
    """Represents a single file in the diff."""
    path: str                             # new-side path (b/ side of the header)
    change_type: DiffChangeType
    old_path: Optional[str] = None        # For renamed files
    additions: int = 0                    # count of '+' lines (excluding '+++')
    deletions: int = 0                    # count of '-' lines (excluding '---')
    content: str = ""                     # Diff content (unified diff format)
    full_content: Optional[str] = None    # Full file content (populated separately if needed)
    hunks: List[str] = field(default_factory=list)
    is_binary: bool = False
    is_skipped: bool = False              # set by DiffProcessor filtering
    skip_reason: Optional[str] = None     # human-readable reason when is_skipped

    @property
    def total_changes(self) -> int:
        """Total touched lines: additions plus deletions."""
        return self.additions + self.deletions

    @property
    def size_bytes(self) -> int:
        """Size of the UTF-8 encoded diff content in bytes."""
        return len(self.content.encode('utf-8'))
@dataclass
class ProcessedDiff:
    """Result of processing a raw diff."""
    files: List[DiffFile]                 # all parsed files, skipped ones included
    total_additions: int = 0              # additions across non-skipped files only
    total_deletions: int = 0              # deletions across non-skipped files only
    total_files: int = 0                  # count of non-skipped files
    skipped_files: int = 0                # count of skipped files
    truncated: bool = False               # True when count/size limits cut files
    truncation_reason: Optional[str] = None
    original_size_bytes: int = 0          # size of the raw diff before filtering
    processed_size_bytes: int = 0         # total size of the retained diff content
    refactoring_signals: List[str] = field(default_factory=list)

    def get_included_files(self) -> List[DiffFile]:
        """Get files that were not skipped."""
        return [f for f in self.files if not f.is_skipped]

    def get_skipped_files(self) -> List[DiffFile]:
        """Get files that were skipped."""
        return [f for f in self.files if f.is_skipped]

    def to_unified_diff(self) -> str:
        """Reconstruct a unified diff by joining the content of included files."""
        parts = []
        for f in self.get_included_files():
            parts.append(f.content)
        return "\n".join(parts)
class DiffProcessor:
    """
    Processes raw diff content with same rules as MCP server.
    Features:
    - Parses unified diff format
    - Applies file size limits
    - Applies file count limits
    - Prioritizes important files
    - Skips binary and generated files
    """
    # File patterns to skip (generated, lock files, etc.)
    SKIP_PATTERNS = [
        r'package-lock\.json$',
        r'yarn\.lock$',
        r'pnpm-lock\.yaml$',
        r'Gemfile\.lock$',
        r'poetry\.lock$',
        r'Cargo\.lock$',
        r'composer\.lock$',
        r'\.min\.(js|css)$',
        r'\.bundle\.(js|css)$',
        r'\.map$',
        r'\.snap$',
        r'__snapshots__/',
        r'\.generated\.',
        r'dist/',
        r'build/',
        r'node_modules/',
        r'vendor/',
        r'\.idea/',
        r'\.vscode/',
        r'\.git/',
    ]
    # High priority file patterns (business logic, entry points)
    HIGH_PRIORITY_PATTERNS = [
        r'(^|/)src/',
        r'(^|/)app/',
        r'(^|/)lib/',
        r'(^|/)core/',
        r'(^|/)api/',
        r'(^|/)service/',
        r'(^|/)controller/',
        r'(^|/)handler/',
        r'(^|/)model/',
        r'(^|/)entity/',
    ]
    # Low priority file patterns (tests, docs, config)
    LOW_PRIORITY_PATTERNS = [
        r'test[s]?/',
        r'spec[s]?/',
        r'__test__/',
        r'\.test\.',
        r'\.spec\.',
        r'_test\.',
        r'\.md$',
        r'\.txt$',
        r'\.json$',
        r'\.yaml$',
        r'\.yml$',
        r'\.toml$',
        r'\.ini$',
        r'\.cfg$',
        r'\.conf$',
    ]

    def __init__(
        self,
        max_file_size: int = MAX_FILE_SIZE_BYTES,
        max_files: int = MAX_FILES_IN_DIFF,
        max_total_size: int = MAX_DIFF_SIZE_BYTES,
        max_lines_per_file: int = MAX_LINES_PER_FILE
    ):
        """
        Initialize the processor with limit thresholds.

        Args:
            max_file_size: Per-file diff size cap in bytes.
            max_files: Maximum number of non-skipped files to keep.
            max_total_size: Total diff size budget in bytes.
            max_lines_per_file: Per-file line-count cap.
        """
        self.max_file_size = max_file_size
        self.max_files = max_files
        self.max_total_size = max_total_size
        self.max_lines_per_file = max_lines_per_file
        # Compile patterns once; all matching is case-insensitive.
        self._skip_patterns = [re.compile(p, re.IGNORECASE) for p in self.SKIP_PATTERNS]
        self._high_priority = [re.compile(p, re.IGNORECASE) for p in self.HIGH_PRIORITY_PATTERNS]
        self._low_priority = [re.compile(p, re.IGNORECASE) for p in self.LOW_PRIORITY_PATTERNS]

    def process(self, raw_diff: str) -> ProcessedDiff:
        """
        Process raw diff and apply all filtering rules.

        Pipeline: parse → mark skips → sort by priority → apply count/size
        limits → aggregate statistics and refactoring signals.

        Args:
            raw_diff: Raw unified diff content
        Returns:
            ProcessedDiff with filtered and prioritized files
        """
        if not raw_diff:
            return ProcessedDiff(files=[], original_size_bytes=0)
        original_size = len(raw_diff.encode('utf-8'))
        # Parse diff into files
        files = self._parse_diff(raw_diff)
        # Apply skip rules (may also replace oversized content with a summary)
        for f in files:
            if self._should_skip(f):
                f.is_skipped = True
        # Sort by priority so the limit pass below keeps the important files
        files = self._prioritize_files(files)
        # Apply limits
        processed_files, truncated, truncation_reason = self._apply_limits(files)
        # Calculate stats over the files that survived filtering
        total_additions = sum(f.additions for f in processed_files if not f.is_skipped)
        total_deletions = sum(f.deletions for f in processed_files if not f.is_skipped)
        total_files = len([f for f in processed_files if not f.is_skipped])
        skipped_files = len([f for f in processed_files if f.is_skipped])
        processed_size = sum(f.size_bytes for f in processed_files if not f.is_skipped)
        return ProcessedDiff(
            files=processed_files,
            total_additions=total_additions,
            total_deletions=total_deletions,
            total_files=total_files,
            skipped_files=skipped_files,
            truncated=truncated,
            truncation_reason=truncation_reason,
            original_size_bytes=original_size,
            processed_size_bytes=processed_size,
            refactoring_signals=self._detect_refactoring_signals(processed_files),
        )

    def _detect_refactoring_signals(self, files: List[DiffFile]) -> List[str]:
        """
        Detect common refactoring patterns to reduce false positives.
        Returns list of human-readable signals like:
        "File rename: old_path → new_path"
        "Balanced add/delete (~120 lines) suggests code move"
        """
        signals = []
        # 1. Explicit renames (git recorded "rename from/to")
        for f in files:
            if f.change_type == DiffChangeType.RENAMED and f.old_path:
                signals.append(f"File rename: {f.old_path} → {f.path}")
        # 2. Paired add + delete of files with same basename
        # NOTE(review): matching on basename alone can pair unrelated files
        # with common names (e.g. __init__.py) — treat as heuristic signal.
        added = {f.path: f for f in files if f.change_type == DiffChangeType.ADDED}
        deleted = {f.path: f for f in files if f.change_type == DiffChangeType.DELETED}
        added_basenames = {}
        for path, f in added.items():
            bn = path.rsplit('/', 1)[-1] if '/' in path else path
            added_basenames.setdefault(bn, []).append(f)
        for del_path, del_f in deleted.items():
            bn = del_path.rsplit('/', 1)[-1] if '/' in del_path else del_path
            if bn in added_basenames:
                for add_f in added_basenames[bn]:
                    signals.append(f"Possible file move: {del_path} → {add_f.path}")
        # 3. Balanced additions/deletions across the whole PR (suggests refactoring)
        total_add = sum(f.additions for f in files if not f.is_skipped)
        total_del = sum(f.deletions for f in files if not f.is_skipped)
        if total_add > 20 and total_del > 20:
            # ratio in (0, 1]; 1.0 means perfectly balanced add/delete
            ratio = min(total_add, total_del) / max(total_add, total_del) if max(total_add, total_del) > 0 else 0
            if ratio > 0.7:
                signals.append(
                    f"Balanced add/delete (+{total_add}/-{total_del}, ratio={ratio:.2f}) "
                    f"suggests refactoring or code move"
                )
        if signals:
            logger.info(f"Refactoring signals detected: {signals}")
        return signals

    def _parse_diff(self, raw_diff: str) -> List[DiffFile]:
        """Parse unified diff into list of DiffFile objects.

        Splits on "diff --git" headers; within each file section, detects the
        change type from git extended headers and counts +/- lines.
        """
        files = []
        current_file = None
        current_content = []
        lines = raw_diff.split('\n')
        i = 0
        while i < len(lines):
            line = lines[i]
            # New file diff section
            if line.startswith('diff --git'):
                # Save previous file before starting a new section
                if current_file:
                    current_file.content = '\n'.join(current_content)
                    files.append(current_file)
                # Parse file paths from the header.
                # NOTE(review): greedy groups split on the LAST " b/" in the
                # line; paths containing the literal substring " b/" would be
                # misparsed — acceptable for typical repos, but confirm.
                match = re.match(r'diff --git a/(.+) b/(.+)', line)
                if match:
                    old_path = match.group(1)
                    new_path = match.group(2)
                    current_file = DiffFile(
                        path=new_path,
                        old_path=old_path if old_path != new_path else None,
                        change_type=DiffChangeType.MODIFIED
                    )
                    current_content = [line]
                else:
                    # Unparseable header: discard lines until the next section
                    current_file = None
                    current_content = []
                i += 1
                continue
            if current_file:
                current_content.append(line)
                # Detect change type from git extended header lines
                if line.startswith('new file mode'):
                    current_file.change_type = DiffChangeType.ADDED
                elif line.startswith('deleted file mode'):
                    current_file.change_type = DiffChangeType.DELETED
                elif line.startswith('rename from'):
                    current_file.change_type = DiffChangeType.RENAMED
                elif line.startswith('Binary files'):
                    current_file.change_type = DiffChangeType.BINARY
                    current_file.is_binary = True
                # Count additions/deletions ('+++'/'---' headers excluded)
                if line.startswith('+') and not line.startswith('+++'):
                    current_file.additions += 1
                elif line.startswith('-') and not line.startswith('---'):
                    current_file.deletions += 1
            i += 1
        # Save last file (loop only flushes on seeing the next header)
        if current_file:
            current_file.content = '\n'.join(current_content)
            files.append(current_file)
        return files

    def _should_skip(self, file: DiffFile) -> bool:
        """Check if file should be skipped based on rules (matching LargeContentFilter).

        Side effects: sets ``file.skip_reason`` and, for oversized files,
        replaces ``file.content`` with a compact summary.
        """
        path = file.path
        threshold_kb = self.max_file_size // 1024
        # Skip binary files
        if file.is_binary:
            file.skip_reason = "Binary file"
            return True
        # Skip deleted files (no code to review)
        if file.change_type == DiffChangeType.DELETED:
            file.skip_reason = "Deleted file"
            return True
        # Skip by pattern
        for pattern in self._skip_patterns:
            if pattern.search(path):
                file.skip_reason = f"Matches skip pattern: {pattern.pattern}"
                return True
        # Skip files that are too large (matching LargeContentFilter)
        if file.size_bytes > self.max_file_size:
            file.skip_reason = f"File too large: {file.size_bytes} bytes > {self.max_file_size}"
            # Generate a compact summary instead of fully omitting the diff.
            # This preserves function/class signatures and line counts so the
            # LLM can still reason about the change without token overflow.
            # NOTE(review): the file is still marked skipped, so this summary
            # is excluded by ProcessedDiff.get_included_files()/to_unified_diff;
            # it only surfaces to consumers reading ProcessedDiff.files
            # directly — confirm that is the intended consumer.
            file.content = summarize_oversized_diff(file.content, path)
            return True
        # Skip files with too many lines
        line_count = file.content.count('\n')
        if line_count > self.max_lines_per_file:
            file.skip_reason = f"Too many lines: {line_count} > {self.max_lines_per_file}"
            file.content = summarize_oversized_diff(file.content, path)
            return True
        return False

    def _get_priority(self, file: DiffFile) -> int:
        """
        Get priority score for file (lower = higher priority).
        Returns:
            0 = High priority
            1 = Medium priority
            2 = Low priority
        """
        path = file.path
        # Check high priority patterns (wins over low-priority matches)
        for pattern in self._high_priority:
            if pattern.search(path):
                return 0
        # Check low priority patterns
        for pattern in self._low_priority:
            if pattern.search(path):
                return 2
        return 1  # Medium priority

    def _prioritize_files(self, files: List[DiffFile]) -> List[DiffFile]:
        """Sort files by priority, keeping non-skipped files first."""
        def sort_key(f: DiffFile) -> Tuple[int, int, int]:
            # (skipped, priority, -changes)
            # Non-skipped first, then by priority, then by number of changes (desc)
            return (
                1 if f.is_skipped else 0,
                self._get_priority(f),
                -f.total_changes
            )
        return sorted(files, key=sort_key)

    def _apply_limits(self, files: List[DiffFile]) -> Tuple[List[DiffFile], bool, Optional[str]]:
        """
        Apply file count and total size limits.

        Mutates ``is_skipped``/``skip_reason`` on files that exceed a limit.
        A file that would blow the size budget is skipped but iteration
        continues, so a later (smaller) file may still fit the remaining
        budget.

        Returns:
            (files, truncated, truncation_reason)
        """
        truncated = False
        truncation_reason = None
        included_count = 0
        total_size = 0
        for f in files:
            if f.is_skipped:
                continue
            # Check file count limit
            if included_count >= self.max_files:
                f.is_skipped = True
                f.skip_reason = f"Exceeds max files limit: {self.max_files}"
                truncated = True
                truncation_reason = f"Diff truncated: exceeded {self.max_files} files limit"
                continue
            # Check total size limit
            if total_size + f.size_bytes > self.max_total_size:
                f.is_skipped = True
                f.skip_reason = f"Would exceed total size limit: {self.max_total_size}"
                truncated = True
                truncation_reason = f"Diff truncated: exceeded {self.max_total_size} bytes total size"
                continue
            included_count += 1
            total_size += f.size_bytes
        return files, truncated, truncation_reason
def process_raw_diff(raw_diff: Optional[str]) -> ProcessedDiff:
    """
    Convenience function to process raw diff with default settings.

    Args:
        raw_diff: Raw unified diff content or None

    Returns:
        ProcessedDiff object (empty when input is None or blank)
    """
    # Non-empty input goes through a default-configured processor.
    if raw_diff:
        return DiffProcessor().process(raw_diff)
    # None/empty: nothing to parse.
    return ProcessedDiff(files=[], original_size_bytes=0)
def format_diff_for_prompt(
    processed_diff: ProcessedDiff,
    include_stats: bool = True,
    max_chars: Optional[int] = None
) -> str:
    """
    Format processed diff for inclusion in LLM prompt.

    Args:
        processed_diff: ProcessedDiff from processor
        include_stats: Whether to include statistics header
        max_chars: Optional character limit for the diff body

    Returns:
        Formatted diff string
    """
    out: List[str] = []

    # Optional statistics header.
    if include_stats:
        out.append("=== DIFF STATISTICS ===")
        out.append(f"Files changed: {processed_diff.total_files}")
        out.append(f"Additions: +{processed_diff.total_additions}")
        out.append(f"Deletions: -{processed_diff.total_deletions}")
        if processed_diff.skipped_files > 0:
            out.append(f"Files skipped: {processed_diff.skipped_files}")
        if processed_diff.truncated:
            out.append(f"⚠️ {processed_diff.truncation_reason}")
        out.append("")

    # One-line-per-file summary of what changed.
    symbol_for = {
        DiffChangeType.ADDED: "A",
        DiffChangeType.MODIFIED: "M",
        DiffChangeType.DELETED: "D",
        DiffChangeType.RENAMED: "R",
        DiffChangeType.BINARY: "B",
    }
    included = processed_diff.get_included_files()
    if included:
        out.append("=== CHANGED FILES ===")
        for entry in included:
            marker = symbol_for.get(entry.change_type, "?")
            out.append(f" [{marker}] {entry.path} (+{entry.additions}/-{entry.deletions})")
        out.append("")

    # Actual diff content, optionally capped at max_chars.
    out.append("=== DIFF CONTENT ===")
    body = processed_diff.to_unified_diff()
    if max_chars and len(body) > max_chars:
        body = body[:max_chars] + "\n... (truncated)"
    out.append(body)
    return "\n".join(out)