# Source: RepoReasoner/related_files_collection.py (main branch of DeepSoftwareAnalytics/RepoReasoner on GitHub)
import json
import os
from pathlib import Path
import sys

# You need to install this library first: pip install rank-bm25
try:
    from rank_bm25 import BM25Okapi
except ImportError:
    print("Error: Dependency 'rank-bm25' is not installed. Please run 'pip install rank-bm25' to install it.", file=sys.stderr)
    sys.exit(1)

# Cache of built corpora, keyed by str(repo_path), so each repository is scanned
# and tokenized at most once per process.
CORPUS_CACHE = {}

# Directory names whose contents are never indexed. Path components are
# lower-cased before being compared against this set.
EXCLUDED_DIR_NAMES = {
    # User-specified
    'venv', 'node_modules', 'build', 'dist', 'site-packages', 'tests', 'test', 'testing',
    # Common irrelevant directories
    '.venv', '.git', '.hg', 'docs', 'examples', 'samples', 'scripts'
}

def create_corpus_for_repo(repo_path: Path):
    """
    Build (or fetch from CORPUS_CACHE) the corpus for a single repository.

    Every ``*.py`` file under ``repo_path`` is considered, except:
      * files located in a directory listed in EXCLUDED_DIR_NAMES, where only
        directories *below* ``repo_path`` are checked;
      * ``__init__.py``, ``setup.py`` and ``conftest.py``;
      * files that cannot be read or decoded as UTF-8;
      * files that contain no whitespace-separated tokens.

    Args:
        repo_path: Root directory of the repository to index.

    Returns:
        A ``(doc_paths, tokenized_corpus)`` tuple of two parallel lists:
        POSIX-style paths relative to ``repo_path`` and, for each path, the
        file content split on whitespace. The same tuple object is returned
        on later calls with the same ``repo_path``.
    """
    repo_key = str(repo_path)
    if repo_key in CORPUS_CACHE:
        return CORPUS_CACHE[repo_key]

    print(f"  - Building corpus for repository '{repo_path.name}'...")
    doc_paths = []
    tokenized_corpus = []

    # Sorted so that the corpus order does not depend on the order in which
    # the filesystem happens to list directory entries.
    for file_path in sorted(repo_path.rglob("*.py")):
        relative_path = file_path.relative_to(repo_path)

        # Only inspect directories inside the repository. Looking at the
        # absolute path would discard the whole repository whenever one of its
        # ancestors (or the repository itself) is called e.g. 'scripts' or 'build'.
        parent_dirs = {part.lower() for part in relative_path.parts[:-1]}
        if not EXCLUDED_DIR_NAMES.isdisjoint(parent_dirs):
            continue

        if file_path.name in ("__init__.py", "setup.py", "conftest.py"):
            continue

        try:
            content = file_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Best effort: unreadable or non-UTF-8 files are left out of the corpus.
            continue

        tokenized_content = content.split()
        if tokenized_content:
            doc_paths.append(relative_path.as_posix())
            tokenized_corpus.append(tokenized_content)

    print(f"  - Corpus built successfully, containing {len(doc_paths)} source files.")
    result = (doc_paths, tokenized_corpus)
    CORPUS_CACHE[repo_key] = result
    return result

def analyze_with_bm25(input_jsonl, output_jsonl, repos_base_dir):
    """
    Rank a repository's source files against each test file named in a JSONL file.

    The repository name is the stem of ``input_jsonl`` and its sources are
    expected under ``<repos_base_dir>/python_repos/<reponame>``. For every input
    record carrying a ``testpath``, the content of that test file is used as a
    BM25 query over the repository corpus, and one JSON line is written with
    ``task_id``, ``reponame``, ``testpath``, ``testname``, ``funcname`` and
    ``related_files_rank`` (the ``get_top_n`` result over all corpus paths).
    Rankings are computed once per distinct ``testpath``.

    Args:
        input_jsonl: Path of the JSONL file to read.
        output_jsonl: Path of the JSONL file to write (truncated if it exists).
        repos_base_dir: Directory that contains the ``python_repos`` folder.

    Returns:
        None. If the repository directory is missing or its corpus is empty, a
        message is printed to stderr and the output file is not opened at all.
        Malformed lines and unreadable test files are reported to stderr and
        skipped; a ``testpath`` that is not a file yields an empty ranking.
    """
    reponame = Path(input_jsonl).stem
    repo_root = Path(repos_base_dir) / "python_repos" / reponame

    if not repo_root.is_dir():
        print(f"Error: Repository directory '{repo_root}' for '{reponame}' not found.", file=sys.stderr)
        return

    corpus_paths, tokenized_corpus = create_corpus_for_repo(repo_root)

    if not corpus_paths:
        print(f"Warning: Corpus for repository '{reponame}' is empty, skipping processing.", file=sys.stderr)
        return

    bm25 = BM25Okapi(tokenized_corpus)

    # Key is testpath, value is the list of retrieved relevant files.
    query_results_cache = {}

    def warn_skipped(error, raw_line):
        # Report a line that could not be processed; the caller moves on to the next one.
        print(f"    - Warning: Error processing line (error: {error}), skipped. Line content: {raw_line.strip()}", file=sys.stderr)

    with open(input_jsonl, 'r', encoding='utf-8') as infile, \
         open(output_jsonl, 'w', encoding='utf-8') as outfile:

        print(f"  - Starting to process entries in {reponame}...")
        for line in infile:
            # Blank lines (e.g. a trailing newline) are not records; skip them quietly.
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                warn_skipped(e, line)
                continue

            # Valid JSON that is not an object has no fields to read.
            if not isinstance(data, dict):
                warn_skipped(f"expected a JSON object, got {type(data).__name__}", line)
                continue

            testpath = data.get("testpath")
            if not testpath:
                continue
            # testpath is used both as a cache key and as a path component.
            if not isinstance(testpath, str):
                warn_skipped(f"'testpath' must be a string, got {type(testpath).__name__}", line)
                continue

            if testpath in query_results_cache:
                ranked_files = query_results_cache[testpath]
            else:
                query_test_path = repo_root / testpath

                if not query_test_path.is_file():
                    ranked_files = []
                else:
                    try:
                        query_content = query_test_path.read_text(encoding="utf-8")
                    except (OSError, UnicodeDecodeError) as e:
                        # One unreadable or non-UTF-8 test file must not abort the
                        # whole run and leave a truncated output file behind.
                        warn_skipped(e, line)
                        continue
                    tokenized_query = query_content.split()
                    ranked_files = bm25.get_top_n(tokenized_query, corpus_paths, n=len(corpus_paths))

                query_results_cache[testpath] = ranked_files

            output_data = {
                "task_id": data.get("task_id"),
                "reponame": data.get("reponame"),
                "testpath": data.get("testpath"),
                "testname": data.get("testname"),
                "funcname": data.get("funcname"),
                "related_files_rank": ranked_files
            }

            # Deliberately outside any try block: a failure to write the output
            # is not a per-line problem and should surface to the caller.
            outfile.write(json.dumps(output_data) + '\n')

def main():
    """
    Batch-process every ``.jsonl`` file found in the input directory.

    All locations are derived from the directory containing this script:
    inputs are read from ``Data_RepoReasoner/original``, repositories are looked
    up relative to the script directory itself, and results are written to
    ``output_with_bm25_rank`` under the same file name as the input.
    """
    # Resolve so the derived paths are absolute even when the script is
    # started through a relative path.
    script_dir = Path(__file__).resolve().parent
    REPOS_BASE_DIR = script_dir
    SOURCE_DATA_DIR = script_dir / 'Data_RepoReasoner' / 'original'
    OUTPUT_DIR = script_dir / 'output_with_bm25_rank'

    if not SOURCE_DATA_DIR.is_dir():
        print(f"Error: Input directory '{SOURCE_DATA_DIR}' does not exist.", file=sys.stderr)
        return

    OUTPUT_DIR.mkdir(exist_ok=True)

    # Sorted so that repositories are processed in the same order on every run.
    files_to_process = sorted(f for f in os.listdir(SOURCE_DATA_DIR) if f.endswith('.jsonl'))

    print(f"Found {len(files_to_process)} files to process, output will be saved to '{OUTPUT_DIR}'...")

    for filename in files_to_process:
        input_path = SOURCE_DATA_DIR / filename
        output_path = OUTPUT_DIR / filename

        print(f"\n[+] Processing: {filename}")
        analyze_with_bm25(str(input_path), str(output_path), str(REPOS_BASE_DIR))

        # analyze_with_bm25 returns without creating the output file when the
        # repository is missing or its corpus is empty, so do not claim success
        # then. NOTE: a file left over from an earlier run also passes this check.
        if output_path.is_file():
            print(f"[✓] Completed: Results saved to {output_path}")
        else:
            print(f"[!] Skipped: No results were written for {filename}", file=sys.stderr)

    print("\nAll files processed!")

if __name__ == '__main__':
    main()