# Format datasets into the competition format, including generating prompts for datasets that ship without them.
import argparse
import os

import pandas as pd
from loguru import logger
from tqdm import tqdm
from transformers import pipeline

import fetch_data

# Loaded once at module import; used to reverse-generate prompts for promptless datasets.
GLOBAL_PIPE = pipeline("text2text-generation", model="google/flan-t5-large")

DATASET_VERSION = "v01-05-2025"  # Version tag baked into the output filename.
DATASET_NAME_TO_PATH = {
    'daigt': './external_sources/daigt/train_v2_drcat_02.csv',
    'persuade': './external_sources/persuade/persuade_corpus_2.0_train.csv',
    'fpe': './external_sources/fpe',
    'outfox': './external_sources/outfox/data',
}
REQUIRED_COLS = (
    'prompt_name',              # Plaintext identifier for the prompt.
    'prompt_text',              # The actual prompt.
    'essay_text',               # Essay written by a student or an LLM.
    'is_prompt_llm_generated',  # For datasets lacking prompts: whether we reverse-generated the prompt.
    'generated',                # Classification target: 0 = student, 1 = LLM.
)
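
# Illustrative row shape (hypothetical values, not drawn from any of the corpora):
#   {'prompt_name': 'car_free_cities', 'prompt_text': 'Write an essay arguing ...',
#    'essay_text': 'Cars have long been ...', 'is_prompt_llm_generated': False,
#    'generated': 0, 'source': 'persuade'}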


def format_all_datasets(read_from_existing=None):
    """Format every dataset in DATASET_NAME_TO_PATH and pickle the concatenation.

    If read_from_existing is given, it is treated as the filename stem of a
    previously pickled dataset under ./data/ and loaded instead of reformatting.
    """
    logger.debug('Running format all datasets.')
    if read_from_existing:
        df = load_datasets_from_pickle(read_from_existing)
        return df
    all_dataframes = []
    for name in DATASET_NAME_TO_PATH:
        logger.debug(f'Formatting {name}.')
        df = format_dataset(name)
        all_dataframes.append(df)
    concatenated_df = pd.concat(all_dataframes, ignore_index=True)
    concatenated_df.reset_index(drop=True, inplace=True)
    joined_dataset_names = '-'.join(DATASET_NAME_TO_PATH.keys())
    dataset_name = f"./data/training_data_version_{DATASET_VERSION}_sources_{joined_dataset_names}.pickle"
    concatenated_df.to_pickle(dataset_name)
    return concatenated_df
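
# Example (illustrative) - rebuild everything from the raw sources:
#   df = format_all_datasets()
# or reload a previous run by its filename stem under ./data/:
#   df = format_all_datasets(read_from_existing='training_data_version_v01-05-2025_sources_daigt-persuade-fpe-outfox')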


def format_dataset(dataset_name):
    """Given a dataset identifier, return a DataFrame formatted for the competition."""
    path = DATASET_NAME_TO_PATH[dataset_name]
    plain_df = dataset_to_pandas(dataset_name, path)
    logger.debug(f'Adding prompts to {dataset_name}.')
    df_with_prompts = add_prompts(plain_df)  # Fill in missing prompts with the LLM pipeline.
    return df_with_prompts


def dataset_to_pandas(dataset_name, path):
    """Dispatch to the per-dataset formatter and drop duplicate essays."""
    if dataset_name == 'daigt':
        df = format_daigt_to_df(path)
    elif dataset_name == 'persuade':
        df = format_persuade_to_df(path)
    elif dataset_name == 'fpe':
        df = format_fpe_to_df(path)
    elif dataset_name == 'outfox':
        df = format_outfox_to_df(path)
    else:
        raise ValueError(f'Unknown dataset: {dataset_name}')
    df = df.drop_duplicates(subset='essay_text')
    return df


def format_daigt_to_df(path):
    df = pd.read_csv(path)
    df = df.rename(columns={'text': 'essay_text', 'label': 'generated'})
    # 'RDizzl3_seven' is a flag in DAIGT v2 marking essays written in response to an
    # original prompt from the challenge; keep only those.
    df = df[df['RDizzl3_seven'] == True]
    df = df.drop(columns=['RDizzl3_seven', 'source'])
    mapping = fetch_data.map_prompt_name_to_prompt_text_persuade()
    df['prompt_text'] = df['prompt_name'].map(mapping)
    df['source'] = 'daigt'
    return df


def format_persuade_to_df(path):
    df = pd.read_csv(path)
    df = df.rename(columns={'full_text': 'essay_text', 'assignment': 'prompt_text'})
    df = df[['prompt_text', 'essay_text']].copy()  # 'prompt_name' is not needed here.
    df['generated'] = 0
    df['source'] = 'persuade'
    return df


def format_outfox_to_df(path):
    """
    The path contains one subdirectory per generator ("chatgpt", "common", ...).
    Only 'chatgpt', 'flan' and 'davinci' are used for now as LLM-generated data.
    """
    problem_statements = pd.read_pickle(f"{path}/common/train/train_problem_statements.pkl")
    human_responses = pd.read_pickle(f"{path}/common/train/train_humans.pkl")
    # Collect every generator's train_lms.pkl, skipping the shared 'common' directory.
    llm_pkl_files_train = []
    for dirpath, _, filenames in os.walk(path):
        if 'common' in dirpath:
            continue
        for filename in filenames:
            if filename == 'train_lms.pkl':
                llm_pkl_files_train.append(os.path.join(dirpath, filename))
    frames = []
    for pkl in llm_pkl_files_train:
        curr_list = pd.read_pickle(pkl)
        frames.append(pd.DataFrame({
            'prompt_text': problem_statements,
            'essay_text': curr_list,
            'generated': 1,  # LLM-generated.
        }))
    # Human responses get the opposite label.
    frames.append(pd.DataFrame({
        'prompt_text': problem_statements,
        'essay_text': human_responses,
        'generated': 0,
    }))
    master_df = pd.concat(frames, ignore_index=True)
    return master_df
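
# Assumed on-disk layout (inferred from the walk above, not verified against the
# OUTFOX release):
#   data/
#     common/train/train_problem_statements.pkl, train_humans.pkl
#     chatgpt/train/train_lms.pkl
#     flan/train/train_lms.pkl
#     davinci/train/train_lms.pkl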


def format_fpe_to_df(path):
    dataframes = {}
    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        df_name = os.path.splitext(file)[0]  # Filename without extension is the key.
        if file.endswith(".csv"):
            if df_name in ('t5_essays_processed', 'mlm_essays_processed'):
                dataframes[df_name] = pd.read_csv(file_path)
        elif file.endswith(".parquet") and not file.startswith('cv'):
            dataframes[df_name] = pd.read_parquet(file_path)
    # Normalize each file's columns and label.
    for name, df in dataframes.items():
        if name == 't5_essays_processed':
            df = df[['essay_text']].copy()
            df['generated'] = 1
        elif name == 'mlm_essays_processed':
            df = df.rename(columns={'prompt': 'prompt_text'})
            df = df[['essay_text', 'prompt_text']].copy()
            df['generated'] = 1
        elif name.startswith("fpe"):
            df = df[['essay_text']].copy()
            df['generated'] = 0
        else:
            logger.warning(f'Unrecognized FPE file: {name}')
        dataframes[name] = df
    result = pd.concat(
        [df.assign(source=key) for key, df in dataframes.items()],
        ignore_index=True,
        sort=False,  # Align columns automatically, filling missing ones with NaN.
    )
    return result


def add_prompts(df: pd.DataFrame, batch_size: int = 8):
    """
    Fills in missing 'prompt_text' in the DataFrame using an LLM pipeline in batches.
    Args:
        df (pd.DataFrame): DataFrame with 'essay_text' and optionally 'prompt_text'.
        batch_size (int): Number of rows to process in each batch.
    Returns:
        pd.DataFrame: Updated DataFrame with 'prompt_text' filled where missing.
    """
    # Determine rows that need filling.
    fill_all = 'prompt_text' not in df.columns
    if fill_all:
        df['prompt_text'] = None  # Add the missing column.
    rows_to_fill = df.index if fill_all else df.index[df['prompt_text'].isna()]
    if rows_to_fill.empty:
        return df  # No missing prompts to fill.
    # Process in batches.
    for start_idx in tqdm(
        range(0, len(rows_to_fill), batch_size),
        desc="Filling prompts in batches",
        total=(len(rows_to_fill) + batch_size - 1) // batch_size,
    ):
        batch_indices = rows_to_fill[start_idx:start_idx + batch_size]
        batch_texts = df.loc[batch_indices, 'essay_text'].tolist()
        # Generate prompts for the batch and write them back.
        generated_prompts = generate_prompts_for_texts(batch_texts, GLOBAL_PIPE, batch_size=batch_size)
        df.loc[batch_indices, 'prompt_text'] = generated_prompts
    return df
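
# Example (illustrative) - fill prompts for a frame that has essays but no prompts:
#   df = pd.DataFrame({'essay_text': ['Some student essay ...']})
#   df = add_prompts(df, batch_size=8)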


def filter_and_sample_merged_df(df: pd.DataFrame, sample_size=10_000):
    """Balance the dataset: equal numbers of LLM and student essays with usable prompts."""
    # Keep only rows whose 'prompt_text' is neither NaN nor blank.
    has_prompt = df['prompt_text'].notnull() & (df['prompt_text'].str.strip() != "")
    only_generated_df = df[(df['generated'] == 1) & has_prompt]
    only_student_df = df[(df['generated'] == 0) & has_prompt]
    num_samples = min(sample_size, len(only_generated_df), len(only_student_df))
    only_generated_df = only_generated_df.sample(n=num_samples, random_state=71)
    only_student_df = only_student_df.sample(n=num_samples, random_state=71)
    # Merge the two classes and shuffle.
    merged_filtered_df = (
        pd.concat([only_generated_df, only_student_df])
        .sample(frac=1, random_state=71)
        .reset_index(drop=True)
    )
    return merged_filtered_df
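
# Example (illustrative) - a balanced set of up to 20k rows (10k per class):
#   balanced = filter_and_sample_merged_df(df, sample_size=10_000)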


def generate_prompts_for_texts(texts, pipe, max_input_tokens=512, batch_size=16):
    """
    Generates prompts for a batch of texts, ensuring only the middle text is truncated.
    Args:
        texts (list of str): List of input texts.
        pipe: Text-to-text generation pipeline.
        max_input_tokens (int): Maximum tokens for the input.
        batch_size (int): Batch size.
    Returns:
        list of str: List of generated prompts.
    """
    from transformers import AutoTokenizer
    import torch

    # Load the tokenizer (pipelines expose theirs; fall back to the model's).
    tokenizer = pipe.tokenizer if hasattr(pipe, 'tokenizer') else AutoTokenizer.from_pretrained(pipe.model.name_or_path)
    prompt_prefix = "This is an essay written by a student in response to an assignment: \n"
    prompt_suffix = "\n Write the assignment that you think was given to the student, and phrase your answer as a question."
    # Tokenize the prefix and suffix separately, without special tokens, so the
    # token budget left for the essay itself is exact.
    prefix_ids = tokenizer(prompt_prefix, add_special_tokens=False, return_tensors="pt").input_ids[0]
    suffix_ids = tokenizer(prompt_suffix, add_special_tokens=False, return_tensors="pt").input_ids[0]
    available_tokens = max_input_tokens - (len(prefix_ids) + len(suffix_ids))
    truncated_inputs = []
    for text in texts:
        # Tokenize and truncate the middle text.
        text_ids = tokenizer(text, add_special_tokens=False, truncation=True, max_length=available_tokens, return_tensors="pt").input_ids[0]
        # Reconstruct the full input around the truncated essay.
        truncated_input_ids = torch.cat([prefix_ids, text_ids, suffix_ids], dim=0)
        truncated_inputs.append(tokenizer.decode(truncated_input_ids, skip_special_tokens=True))
    # Generate prompts in batch.
    generated = pipe(
        truncated_inputs,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.8,
        batch_size=batch_size,
    )
    return [g['generated_text'].strip() for g in generated]


def sample_by_percentages(df: pd.DataFrame, percentages: dict) -> pd.DataFrame:
    """
    Samples the dataset based on the given percentages for each source.
    Args:
        df (pd.DataFrame): The input dataset with a 'source' column.
        percentages (dict): A dictionary mapping sources to their respective percentages (0-1).
    Returns:
        pd.DataFrame: A new DataFrame with the sampled data.
    """
    sampled_data = []
    for source, percentage in percentages.items():
        # Filter rows belonging to the current source and sample the requested share.
        source_df = df[df['source'] == source]
        num_samples = int(len(source_df) * percentage)
        sampled_df = source_df.sample(n=num_samples, random_state=42)  # Fixed seed for reproducibility.
        sampled_data.append(sampled_df)
    # Concatenate all sampled dataframes.
    result_df = pd.concat(sampled_data, ignore_index=True)
    return result_df
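
# Example (illustrative) - keep half of daigt and all of persuade:
#   mixed = sample_by_percentages(df, {'daigt': 0.5, 'persuade': 1.0})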


def load_datasets_from_pickle(filename):
    """Load a pickled dataset by its filename stem (without extension) from ./data/."""
    path = f"./data/{filename}.pickle"
    unpickled_df = pd.read_pickle(path)
    return unpickled_df


if __name__ == '__main__':
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--download_kaggle",
        action='store_true',
        help="Download the source datasets from Kaggle before formatting.",
    )
    args = argparser.parse_args()
    if args.download_kaggle:
        fetch_data.download_all_kaggle_datasets()
    format_all_datasets()
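
# Typical invocation (illustrative; assumes Kaggle credentials are already
# configured for fetch_data's downloader):
#   python format.py --download_kaggle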