# Kitana-e2e/extract.py (from the cudbg/Kitana-e2e repository on GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import shutil
import pandas as pd
import csv
import json
from tqdm import tqdm
import random
import argparse

def collect_csv_files(argv=None):
    """Copy every ``.csv`` file found under a source tree into one flat directory.

    The source and destination directories are read from the command-line
    options ``--source_dir`` and ``--destination_dir`` (both required). The
    source tree is walked recursively; each file whose name ends in ``.csv``
    (case-insensitive) is copied into the destination directory. When a file
    with the same name already exists there, a numeric suffix (``_1``, ``_2``,
    ...) is appended to the base name until an unused name is found.

    The destination directory itself is never scanned, so pointing it at (or
    inside) the source tree does not re-copy files that were just collected.

    Args:
        argv: Optional list of command-line arguments to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the original
            behaviour of reading the process arguments.

    Returns:
        None. Progress and copy errors are reported on standard output.
    """
    # Get the arguments for source_dir and destination_dir.
    parser = argparse.ArgumentParser(description="Collect csv files from source_dir to destination_dir")
    parser.add_argument(
        "--source_dir",
        type=str,
        required=True,
        help="Source directory containing csv files"
    )
    parser.add_argument(
        "--destination_dir",
        type=str,
        required=True,
        help="Destination directory to copy csv files"
    )
    args = parser.parse_args(argv)
    source_dir = args.source_dir
    destination_dir = args.destination_dir

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(destination_dir, exist_ok=True)
    destination_real = os.path.realpath(destination_dir)

    for root, dirs, files in os.walk(source_dir):
        # Prune the destination from the walk (in-place edit is honoured by
        # os.walk in top-down mode); otherwise freshly copied files would be
        # discovered and duplicated when the destination is nested in source.
        dirs[:] = [
            d for d in dirs
            if os.path.realpath(os.path.join(root, d)) != destination_real
        ]
        # When source and destination are the same directory, its own files
        # are already "collected"; only its subdirectories need scanning.
        if os.path.realpath(root) == destination_real:
            continue

        for file in files:
            if not file.lower().endswith('.csv'):
                continue

            src_file = os.path.join(root, file)
            dst_file = os.path.join(destination_dir, file)

            # Resolve name collisions by appending _1, _2, ... to the base name.
            count = 1
            base, ext = os.path.splitext(file)
            while os.path.exists(dst_file):
                dst_file = os.path.join(destination_dir, f"{base}_{count}{ext}")
                count += 1

            # Best effort: report a failed copy and continue with the rest.
            try:
                shutil.copy2(src_file, dst_file)
                print(f"copy: {src_file} -> {dst_file}")
            except OSError as e:
                print(f"Error in copying {src_file} {e}")


# Script entry point: collect the CSV files only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    collect_csv_files()
    # NOTE(review): the commented-out calls below reference helpers
    # (remove_quotes_from_csv, see_column_content, split_and_shuffle_large_csv)
    # and names (source_dir, csv_header) that are not defined in the visible
    # part of this file -- presumably leftovers from earlier experiments;
    # confirm before re-enabling or remove them.
    # input_file = "companiesmarketcap.com - Companies ranked by earnings - CompaniesMarketCap.com.csv"
    # output_file = "companiesmarketcap.com - Companies ranked by earnings - CompaniesMarketCap.com_clean.csv"
    # remove_quotes_from_csv(input_file, output_file)

    # see_column_content(dir_path=source_dir,  csv_header_dict=csv_header)
    # num_rows_per_table = 250
    # output_dir = "/home/ec2-user/Kitana_e2e/Kitana-e2e/data/company"
    # split_and_shuffle_large_csv(csv_header_dict=csv_header, intermediate_chunk_size=1000000, final_chunk_size=100000, output_dir="/home/ec2-user/Kitana_e2e/Kitana-e2e/data/company")