MpraScreen/mixed_setup.py at main · fl16180/MpraScreen · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
''' This is not currently used but code could be useful
'''

import argparse
import os
import numpy as np
import pandas as pd

from constants import *
from datasets import load_data_set, load_neighbors_set
from utils.data_utils import split_train_test


def get_e116_pos():
    mpra_data = load_data_set(
        'mpra_e116', split='all', datasets=['roadmap', 'eigen', 'regbase'],
        make_new=False
    )
    mpra_neighbors = load_neighbors_set(
        'mpra_e116', split='all', tissue='E116'
    )

    mpra_pos = mpra_data[mpra_data.Label == 1]
    neigh_pos = mpra_neighbors[mpra_data.Label == 1]
    print('E116 positives: ', mpra_pos.shape[0])
    return mpra_pos, neigh_pos


def get_nova_pos():
    mpra_data = load_data_set(
        'mpra_nova', split='all', datasets=['roadmap', 'eigen', 'regbase'],
        make_new=False
    )
    mpra_neighbors = load_neighbors_set(
        'mpra_nova', split='all', tissue='E116'
    )
    thresh = 0.05    # use FDR 0.05 intersection

    # use index to match duplications/reorderings for neighbor data
    mpra_data['INDEX'] = np.arange(len(mpra_data))
    mpra_data = merge_with_validation_info(mpra_data, 'mpra_nova')
    mpra_neighbors = mpra_neighbors[mpra_data['INDEX'].values, :]

    # define functional as intersection of expression and allelic
    expr_sig = mpra_data['padj_expr'] < thresh
    allele_sig = mpra_data['padj_allele'] < thresh
    mpra_data['Label'] = np.logical_and(expr_sig, allele_sig).astype(int)

    # # get duplication mask and drop duplicates
    # dupls = mpra_data.duplicated(subset=['chr', 'pos', 'Label'])
    # mpra_data = mpra_data.loc[~dupls, :]
    # mpra_neighbors = mpra_neighbors[~dupls, :]

    print('NovaSeq2 positives:', mpra_data['Label'].sum())
    print('expr pos: ', expr_sig.sum())
    print('allelic pos: ', allele_sig.sum())

    mpra_data.drop(['Pool', 'pvalue_expr', 'padj_expr', 'pvalue_allele',
                    'padj_allele'], axis=1, inplace=True)

    mpra_pos = mpra_data[mpra_data.Label == 1]
    neigh_pos = mpra_neighbors[mpra_data.Label == 1]
    return mpra_pos, neigh_pos


def merge_with_validation_info(scores, eval_proj):
    """ Load full mpra_nova dataset in order to get full set of labels,
    i.e. the expression and allelic p-values.

    Join with scores dataframe using chr and pos columns.
    """
    ext_val_path = MPRA_DIR / MPRA_TABLE[eval_proj][0]
    dat = pd.read_csv(ext_val_path, sep='\t')
    dat.rename(columns={'chrom': 'chr', 'Hit': 'Label'}, inplace=True)
    dat = dat.loc[:,
        ['chr', 'pos', 'Pool', 'pvalue_expr',
        'padj_expr', 'pvalue_allele', 'padj_allele', 'Label']
    ]
    dat['chr'] = dat['chr'].map(lambda x: x[3:])
    dat[['chr', 'pos']] = dat[['chr', 'pos']].astype(int)

    scores.drop('Label', axis=1, inplace=True)
    scores = pd.merge(dat, scores, on=['chr', 'pos'])
    return scores


def add_matched_variants(bed):
    return

if __name__ == '__main__':

    save_files = False

    e116_pos, e116_neigh_pos = get_e116_pos()
    nova_pos, nova_neigh_pos = get_nova_pos()

    used_cols = e116_pos.columns.tolist()
    nova_pos = nova_pos.loc[:, used_cols].copy()

    all_pos = pd.concat([e116_pos, nova_pos], axis=0)
    all_pos = all_pos.loc[:, ['chr', 'pos', 'Label']]


    train_pos, test_pos = split_train_test(all_pos, test_frac=0.15, seed=0)

    if save_files:
        all_pos.to_csv(PROCESSED_DIR / 'gnom_mpra_mixed' / 'all_pos2.csv', index=False, header=False, sep=',')
        train_pos.to_csv(PROCESSED_DIR / 'gnom_mpra_mixed' / 'train_pos2.csv', index=False, header=False, sep=',')
        test_pos.to_csv(PROCESSED_DIR / 'gnom_mpra_mixed' / 'test_pos2.csv', index=False, header=False, sep=',')