# fetch_data.py

import os
import pickle

import pandas as pd
import requests
from loguru import logger
from tqdm import tqdm

DATASET_TYPES = ('train', 'test', 'valid')
HOME = os.getcwd()
PERSUADE_DATA_PATH = './external_sources/persuade/persuade_corpus_2.0_train.csv'
OUTFOX_DATA_PATH = os.path.join(HOME, "external_sources/outfox/data/")

KAGGLE_DATASETS = [
    {
        "url_or_identifier": "https://www.kaggle.com/datasets/conjuring92/fpe-processed-dataset",
        "path": "./external_sources/fpe"
    },
    {
        "url_or_identifier": "https://www.kaggle.com/datasets/thedrcat/daigt-v2-train-dataset",
        "path": "./external_sources/daigt"
    },
]
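
# Assumption: marks whether a dataset ships the prompt text alongside each
# essay. The mapping is not consulted anywhere in this module, so it is
# presumably read by downstream loaders.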
DATASET_NAME_TO_PROMPT_EXISTENCE = {
    "daigt": True,
}


def fetch_llm_data_outfox():
    """
    Fetch only the LLM-generated essays from Outfox.
    Returns a single DataFrame containing all essays.
    Assumes each pkl file contains a list of essays (strings or dicts).
    """
    # Build the nested 'dipper' paths with os.path.join so they resolve on
    # any OS (the hard-coded backslashes only worked on Windows).
    OUTFOX_LLM_SOURCES = (
        'chatgpt', 'common',
        os.path.join('dipper', 'chatgpt'),
        os.path.join('dipper', 'flan_t5_xxl'),
        os.path.join('dipper', 'text_davinci_003'),
        'flan_t5_xxl',
        'text_davinci_003',
    )
    all_essays = []
    for source in OUTFOX_LLM_SOURCES:
        for dtype in DATASET_TYPES:
            path = os.path.join(OUTFOX_DATA_PATH, source, dtype, f"{dtype}_lms.pkl")
            try:
                with open(path, 'rb') as file:
                    data = pickle.load(file)
                if isinstance(data, list):
                    all_essays.extend(data)
                elif isinstance(data, pd.DataFrame):
                    all_essays.extend(data.to_dict(orient='records'))
                else:
                    logger.warning(f"Unknown format in {path}")
            except FileNotFoundError:
                continue
    return pd.DataFrame(all_essays)
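
# A minimal usage sketch (assumes the Outfox pickles already sit under
# OUTFOX_DATA_PATH; `essays_df` is a hypothetical name):
#   essays_df = fetch_llm_data_outfox()
#   print(f"{len(essays_df)} LLM-generated essays loaded")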


def download_kaggle_dataset(dataset_url_or_identifier: str, download_path: str):
    """Downloads a dataset from Kaggle using the Kaggle API.

    Args:
        dataset_url_or_identifier (str): The URL of the Kaggle dataset page or dataset identifier (username/dataset_name).
        download_path (str): The local directory where the dataset will be saved.
    """
    # Extract the dataset identifier if a URL is provided.
    if dataset_url_or_identifier.startswith("https://"):
        dataset_identifier = "/".join(dataset_url_or_identifier.split("/")[-2:])
    else:
        dataset_identifier = dataset_url_or_identifier
    # Ensure the download path exists.
    os.makedirs(download_path, exist_ok=True)
    # Import lazily: importing kaggle authenticates against ~/.kaggle/kaggle.json,
    # so keeping it out of the module header lets the rest of the file work
    # without credentials. Importing before the try also guarantees the name
    # is bound when the except clause references kaggle.rest.ApiException.
    import kaggle
    try:
        kaggle.api.dataset_download_files(dataset_identifier, path=download_path, unzip=True)
        logger.info(f"Dataset downloaded and extracted to {download_path}")
    except kaggle.rest.ApiException as e:
        logger.error(f"Failed to download dataset '{dataset_identifier}': {e}")


def download_all_kaggle_datasets():
    logger.debug("Downloading Kaggle datasets to data dir.")
    for dataset in KAGGLE_DATASETS:
        logger.debug(f"Downloading {dataset['url_or_identifier']}.")
        download_kaggle_dataset(dataset["url_or_identifier"], dataset["path"])


def download_file_with_progress(url, output_path):
    # Send a GET request with stream=True so the file is never held in memory at once
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise an error for bad status codes
    total_size = int(response.headers.get('content-length', 0))  # Total size in bytes
    block_size = 1024  # Block size (1 KB)
    # Display the progress bar while writing chunks to disk
    with tqdm(total=total_size, unit='B', unit_scale=True, desc="Downloading") as pbar:
        with open(output_path, "wb") as file:
            for data in response.iter_content(block_size):
                file.write(data)
                pbar.update(len(data))  # Update the progress bar
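
# Not called elsewhere in this module. Example usage (hypothetical URL and path):
#   download_file_with_progress("https://example.com/corpus.csv", "./corpus.csv")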


def map_prompt_name_to_prompt_text_persuade():
    df = pd.read_csv(PERSUADE_DATA_PATH)
    # Deduplicate just the two columns we need; deduplicating full essay rows
    # is slower and can still leave repeated prompt names in the index.
    pairs = df[["prompt_name", "assignment"]].drop_duplicates()
    return pairs.set_index("prompt_name")["assignment"].to_dict()
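
# Shape of the returned mapping (values depend on the PERSUADE csv):
#   {"<prompt_name>": "<full assignment text>", ...}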


def fetch_gpt2_data():
    subdir = 'data'
    if not os.path.exists(subdir):
        os.makedirs(subdir)
    subdir = subdir.replace('\\', '/')  # needed for Windows
    for ds in [
        'webtext',
        'small-117M', 'small-117M-k40',
        'medium-345M', 'medium-345M-k40',
        'large-762M', 'large-762M-k40',
        'xl-1542M', 'xl-1542M-k40',
    ]:
        for split in ['train', 'valid', 'test']:
            filename = ds + "." + split + '.jsonl'
            r = requests.get("https://openaipublic.azureedge.net/gpt-2/output-dataset/v1/" + filename, stream=True)
            with open(os.path.join(subdir, filename), 'wb') as f:
                file_size = int(r.headers["content-length"])
                chunk_size = 1000
                with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
                    # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                        pbar.update(len(chunk))  # len(chunk), not chunk_size: the final chunk is usually shorter
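
# This appears to closely follow the download script shipped with OpenAI's
# gpt-2-output-dataset repository, which hosts these webtext/GPT-2 sample files.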


if __name__ == '__main__':
    download_all_kaggle_datasets()
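    # The remaining fetchers can be run individually as needed, e.g.:
    #   fetch_gpt2_data()
    #   essays = fetch_llm_data_outfox()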