forked from ManchesterBioinference/mRNA_LLM
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparams.yaml
More file actions
203 lines (174 loc) · 5.28 KB
/
params.yaml
File metadata and controls
203 lines (174 loc) · 5.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# Settings under `setupEnv`.
setupEnv:
  # Presumably the name of the environment to create/activate — confirm
  # against the code that reads this key.
  name: rayTune
# Settings under `downloadSpeciesData`.
downloadSpeciesData:
  # Relative path (base directory is not shown here — confirm).
  output_dir: data/downloaded
# Settings under `isolate3UTRs`.
isolate3UTRs:
  output_dir: data/3UTRs
  # A single string holding a comma-separated list (no spaces between items);
  # it is NOT a YAML sequence, so the consumer must split it itself.
  transcriptSection: three_prime_utr,CDS,five_prime_utr
# Settings under `preprocessFlyUTRs`: an output directory, a k-mer size and
# the paths of several ID-mapping inputs.
preprocessFlyUTRs:
  output_dir: output/data/flyUTRs
  kmer: 4
  oma_groups: data/oma-groups.txt.gz
  oma_ncbi_map: data/oma-ncbi.txt.gz
  # Disabled setting, kept for reference:
  # ncbi_flybase_map: "data/id_validation_table_ncbi_flybase.txt"
  ncbi_geneID_map: data/gene2refseq_Dmel.txt
  geneID_flybase_map: data/GeneID_to_FlybaseID.txt
  # Disabled setting, kept for reference:
  # maxSeqLen: null
# Settings under `trainToFlyUTRs`: run-mode flags, model/tokenizer selection,
# optimiser hyperparameters, checkpointing and logging options.
trainToFlyUTRs:
  # Run-mode switches; only `do_train` is enabled in this configuration.
  test_run: false
  do_train: true
  do_eval: false
  do_predict: false
  do_visualize: false

  output_dir: output/flyTrained
  kmer: 4
  model_type: 3utrfly
  should_continue: false
  model_name_or_path: data/4-new-12w-0
  tokenizer_name: rna4

  evaluate_during_training: true
  do_visualize_during_training: false
  # Early-stopping controls (exact semantics are defined by the consumer —
  # confirm there).
  numEpochsBeforeEarlyStopping: 20
  patience: 10

  per_gpu_train_batch_size: 48
  num_train_epochs: 100

  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # resolve `5e-6` as the *string* "5e-6", whereas `5.0e-6` is a float under
  # both YAML 1.1 and 1.2.
  learning_rate: 5.0e-6
  beta1: 0.9
  beta2: 0.98
  # Same reason as `learning_rate`: keep the dot so this loads as a float.
  adam_epsilon: 1.0e-6
  weight_decay: 0.01
  warmup_percent: 0.1
  seed: 6

  eval_all_checkpoints: false
  no_cuda: false
  logging_steps: 100
  save_steps: 1000
  save_total_limit: 2
  overwrite_output_dir: true

  # Explicit null (previously a bare empty value, which also loads as null).
  # Do not commit a real token here; supply credentials out of band.
  neptune_token: null
  neptune_project: null

  overwrite_cache: false
  do_lower_case: false
  curriculumLearning: false
# Settings under `mergeUTRsAndDecayRates`: one input CSV and one output FASTA.
mergeUTRsAndDecayRates:
  # Alternative inputs that were left commented out next to this value:
  #   "data/results_zygotic_tr_extended_filtered.csv"
  #   "data/maternal-halflifes_modified_filtered.csv"
  decay_rates: data/decay/parameters_estimates_extended_zygotic_tr_18022025_filtered.csv
  output_file: output/data/utrDecayRates.fasta
# Settings under `isolateFullTranscripts`.
isolateFullTranscripts:
  # Output FASTA path (relative).
  output_file: output/data/full_transcripts/Dmela-full_transcripts.fa
# Settings under `getDecayResiduals`: three file paths, all inside
# output/data/codons/.
getDecayResiduals:
  codon_output: output/data/codons/codon_frequencies.pkl
  residuals: output/data/codons/decayResiduals.fasta
  extraFeatures: output/data/codons/extraFeatures.csv
# Settings under `runViennaRNA`.
runViennaRNA:
  output_csv: output/data/codons/vienna_features.csv
  # NOTE(review): absolute path that appears to point into one user's home
  # directory; it will not resolve on other machines. Consider overriding it
  # per environment.
  condaPath: /mnt/mr01-home01/m65338lb/.local/bin/micromamba
  maxWorkers: 8
  # Disabled setting, kept for reference:
  # RNAfold_path: "/mnt/mr01-home01/m65338lb/.local/share/mamba/envs/inseq/bin/RNAfold"
# Settings under `preprocessData`.
preprocessData:
  output_dir: output/data/decay
  # Same k-mer size as `preprocessFlyUTRs.kmer` and `trainToFlyUTRs.kmer` (4).
  kmer: 4
# Settings under `fineTuneModel`: output location plus run-mode switches.
fineTuneModel:
  output_dir: output/ftModel
  # Only training is enabled in this section.
  do_train: true
  do_eval: false
  do_predict: false
  do_visualize: false
# Settings under `predict`: output location plus run-mode switches.
predict:
  output_dir: output/predict
  # Only prediction is enabled in this section.
  do_train: false
  do_eval: false
  do_predict: true
  do_visualize: false
# Settings under `ablate_features`.
# NOTE(review): this key is snake_case while most sibling sections are
# camelCase; left as-is because consumers look it up by name.
ablate_features:
  output_dir: output/data/ablations
# Settings under `modelParams`: model/task selection, training-loop options,
# checkpointing and logging.
modelParams:
  model_type: 3utrfly
  task_name: rnadecay
  should_continue: false

  evaluate_during_training: true
  do_visualize_during_training: false
  patience: 5

  per_gpu_train_batch_size: 48
  per_gpu_eval_batch_size: 48
  num_train_epochs: 50
  warmup_percent: 0.1
  seed: 6

  eval_all_checkpoints: false
  no_cuda: false
  logging_steps: 100
  save_steps: 100000
  save_total_limit: 10
  overwrite_output_dir: true

  # Explicit null (previously a bare empty value, which also loads as null).
  # Do not commit a real token here; supply credentials out of band.
  neptune_token: null
  neptune_project: null

  overwrite_cache: false
  do_lower_case: false
# Settings under `rayTune`. The per-key notes mirror the comments written on
# the same-named keys under `rayTuneClassificationMLP` in this file.
rayTune:
  ray_tune_samples: 100  # Number of hyperparameter combinations to try
  ray_tune_max_epochs: 50  # Maximum epochs per trial
  # Presumably the number of initial points for the search — TODO confirm.
  ray_tune_initial_points: 5
  ray_tune_grace_period: 4  # Minimum epochs before early stopping
  ray_tune_reduction_factor: 2  # ASHA scheduler reduction factor
  ray_tune_cpu_per_trial: 2  # CPUs allocated per trial
  ray_tune_gpu_per_trial: 1.0  # GPUs allocated per trial
  ray_tune_local_dir: output/ray_results  # Directory for Ray Tune results
# Settings under `importanceAnalysis`.
importanceAnalysis:
  output_dir: output/importance
  # Bare file name (no directory component); presumably resolved relative to
  # `output_dir` — confirm.
  pickle_file: shap.pkl
  debug: false
# Settings under `visualizeImportance`: three bare file names (no directory
# component; the base directory is decided by the consumer — confirm).
visualizeImportance:
  scoresOnly: shapScores.npy
  save_path: visualizeImportance.html
  save_tokenized_path: visualizeImportance_tokenized.html
# Settings under `findMotifs`: an output directory and the bare file names of
# the sequence/position files.
findMotifs:
  # Disabled setting, kept for reference:
  # p_adjust: "fdr_bh"
  output_dir: output/motifs
  control_file: control_seqs.fasta
  motif_file: motif_seqs.fasta
  control_positions: control_positions.txt
  motif_positions: motif_positions.txt
  interest_file: interest_seqs.fasta
# Settings under `motifEnrichment`.
motifEnrichment:
  output_dir: output/motifEnrichment
# Settings under `motifEnrichment_highLowDecay`.
motifEnrichment_highLowDecay:
  output_dir: output/motifEnrichment_highLow
# Settings under `prepRandomizeSeqsAndExtraFeatures`.
prepRandomizeSeqsAndExtraFeatures:
  sequence_dir: output/data/sanityCheck
  # Given without a file extension; how it maps to a file on disk is up to
  # the consumer — confirm.
  seq_file: dev
# Settings under `randomizeSeqsAndExtraFeatures`.
randomizeSeqsAndExtraFeatures:
  per_gpu_pred_batch_size: 128
# Settings under `splitHighLowDecay`: an output location and the two FASTA
# file names for the high/low groups.
splitHighLowDecay:
  # Note the key is `output`, not `output_dir` as in most other sections.
  output: output/splitDecay
  high: highDecay.fasta
  low: lowDecay.fasta
# Settings under `prepForMAST`.
prepForMAST:
  output_dir: output/data/MEME_suite
  output_fasta: TEseqs_split_by_utr.fasta
# Settings under `getMASTBackgroundFile`.
getMASTBackgroundFile:
  # Bare file name; the directory it is written to is decided by the
  # consumer — confirm.
  output_file: nucFrequencies.txt
# Settings under `runMAST`.
runMAST:
  output_dir: output/mast_out
  output_file: mast_results.txt
  # NOTE(review): absolute path tied to a MEME 5.5.7 installation under
  # /opt/meme; it must exist on whichever machine/container runs this.
  motif_file: /opt/meme/share/meme-5.5.7/db/motif_databases/RNA/Ray2013_rbp_Drosophila_melanogaster.meme
  mast_pvalue_threshold: 0.05
# Settings under `codonOnlyClassificationMLP`: output location, a label and
# the training-loop limits.
codonOnlyClassificationMLP:
  output_dir: output/codonOnlyClassificationMLP
  label: codonOnly
  patience: 7
  per_device_batch_size: 48
  num_train_epochs: 100
# Settings under `allExtraFeaturesClassificationMLP`; only a label is set.
allExtraFeaturesClassificationMLP:
  label: allExtraFeatures
# Settings under `extraFeaturesAndMotifCountsClassificationMLP`; only a label
# is set.
extraFeaturesAndMotifCountsClassificationMLP:
  label: extraFeaturesAndMotifCounts
# Ray Tune specific parameters for the classification MLP.
rayTuneClassificationMLP:
  ray_tune_samples: 300  # Number of hyperparameter combinations to try
  ray_tune_max_epochs: 100  # Maximum epochs per trial
  # Number of initial points; presumably the initial samples taken before the
  # search algorithm starts guiding trials — TODO confirm against the tuner.
  ray_tune_initial_points: 30
  ray_tune_grace_period: 5  # Minimum epochs before early stopping
  ray_tune_reduction_factor: 2  # ASHA scheduler reduction factor
  ray_tune_cpu_per_trial: 1  # CPUs allocated per trial
  ray_tune_gpu_per_trial: 0  # GPUs allocated per trial
  ray_tune_local_dir: output/ray_results_MLP  # Directory for Ray Tune results
  # Whether to train the final model with the best hyperparameters found
  train_final_model: true