orenavram · tehilayehudai · Feb 6, 2020 · Feb 6, 2020 · Feb 13, 2020 · Feb 14, 2020
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,6 @@
+.*
+**/__pycache__
+docker
+test*
+output*
+# mock_data
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,10 @@
+.venv
+.vscode
+__pycache__
+test*
+output*
+
+UnitePSSMs/UnitePSSMs
+PSSM_score_Peptide/PSSM_score_Peptide
+hits_cpp/hits
+tfidf/tfidf
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,42 @@
+FROM ubuntu:18.04
+
+RUN apt update -y && \
+    apt install -y python3 python3-pip python3-venv && \
+    apt install -y g++ && \
+    apt install -y zlib1g-dev && \
+    apt install -y wget
+
+RUN wget https://github.com/weizhongli/cdhit/releases/download/V4.8.1/cd-hit-v4.8.1-2019-0228.tar.gz && \
+    tar xzf cd-hit-v4.8.1-2019-0228.tar.gz && \
+    rm -rf cd-hit-v4.8.1-2019-0228.tar.gz && \
+    cd cd-hit-v4.8.1-2019-0228 && \
+    make && \
+    make install && \
+    cd .. && \
+    rm -rf cd-hit-v4.8.1-2019-0228
+
+RUN wget https://mafft.cbrc.jp/alignment/software/mafft_7.450-1_amd64.deb && \
+    dpkg -i mafft_7.450-1_amd64.deb && \
+    rm -rf mafft_7.450-1_amd64.deb
+
+RUN mkdir /app
+WORKDIR /app
+
+COPY requirements.txt /app
+RUN python3 -m venv .venv && \
+    . .venv/bin/activate && \
+    pip install -r requirements.txt
+
+COPY . /app
+RUN cd UnitePSSMs && \
+    g++ *.cpp -std=c++11 -O3 -o UnitePSSMs && \
+    cd ../PSSM_score_Peptide && \
+    g++ *.cpp -std=c++11 -O3 -o PSSM_score_Peptide && \
+    cd ../hits_cpp && \
+    g++ *.cpp -std=c++11 -O3 -o hits && \
+    cd ../tfidf && \
+    g++ *.cpp -std=c++11 -O3 -o tfidf
+
+ENV APP_FILE IgOmeProfiling_pipeline.py
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["-h"]
diff --git a/DockerfileWorker b/DockerfileWorker
@@ -0,0 +1,3 @@
+FROM webiks/igome-profile:latest
+
+ENTRYPOINT ["./worker_entrypoint.sh"]
diff --git a/IgOmeProfiling_pipeline.py b/IgOmeProfiling_pipeline.py
@@ -3,17 +3,20 @@
 import sys
 if os.path.exists('/groups/pupko/orenavr2/'):
     src_dir = '/groups/pupko/orenavr2/igomeProfilingPipeline/src'
-else:
+elif os.path.exists('/Users/Oren/Dropbox/Projects/'):
     src_dir = '/Users/Oren/Dropbox/Projects/gershoni/src'
+else:
+    src_dir = '.'
 sys.path.insert(0, src_dir)
 
 from auxiliaries.pipeline_auxiliaries import *
 
 def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondition_path, analysis_dir, logs_dir,
-                 left_construct, right_construct, max_mismatches_allowed, min_sequencing_quality, gz,
+                 left_construct, right_construct, max_mismatches_allowed, min_sequencing_quality, minimal_length_required, gz,
                  max_msas_per_sample, max_msas_per_bc,
                  max_number_of_cluster_members_per_sample, max_number_of_cluster_members_per_bc,
-                 allowed_gap_frequency, number_of_random_pssms,
+                 allowed_gap_frequency, concurrent_cutoffs, meme_split_size, use_mapitope, number_of_random_pssms,
+                 rank_method, tfidf_method, tfidf_factor, shuffles, shuffles_percent, shuffles_digits,
                  run_summary_path, error_path, queue, verbose, argv):
 
     os.makedirs(os.path.split(run_summary_path)[0], exist_ok=True)
@@ -39,8 +42,8 @@ def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondi
 
         module_parameters = [fastq_path, first_phase_output_path, first_phase_logs_path,
                              barcode2samplename_path, left_construct, right_construct,
-                             max_mismatches_allowed, min_sequencing_quality, first_phase_done_path,
-                             '--gz' if gz else '', f'--error_path {error_path}', '-v' if verbose else '']
+                             max_mismatches_allowed, min_sequencing_quality, minimal_length_required,first_phase_done_path,
+                             '--gz' if gz else '', f'--error_path {error_path}', '-v' if verbose else '', '-m' if use_mapitope else '']
         cmd = submit_pipeline_step(f'{src_dir}/reads_filtration/module_wraper.py',
                              [module_parameters],
                              logs_dir, f'{exp_name}_reads_filtration',
@@ -61,7 +64,10 @@ def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondi
                              samplename2biologicalcondition_path, max_msas_per_sample, max_msas_per_bc,
                              max_number_of_cluster_members_per_sample, max_number_of_cluster_members_per_bc,
                              allowed_gap_frequency, second_phase_done_path,
-                             f'--error_path {error_path}', '-v' if verbose else '', f'-q {queue}']
+                             f'--meme_split_size {meme_split_size}',
+                             f'--error_path {error_path}', '-v' if verbose else '', f'-q {queue}','-m' if use_mapitope else '']
+        if concurrent_cutoffs:
+            module_parameters.append('--concurrent_cutoffs')
         cmd = submit_pipeline_step(f'{src_dir}/motif_inference/module_wraper.py',
                              [module_parameters],
                              logs_dir, f'{exp_name}_motif_inference',
@@ -71,7 +77,7 @@ def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondi
                          error_file_path=error_path, suffix='motif_inference_done.txt')
     else:
         logger.info(f'{datetime.datetime.now()}: skipping motif inference. Done file exists at:\n{second_phase_done_path}')
-
+        
     third_phase_done_path = f'{logs_dir}/model_fitting_done.txt'
     if not os.path.exists(third_phase_done_path):
         os.makedirs(third_phase_output_path, exist_ok=True)
@@ -80,8 +86,16 @@ def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondi
 
         module_parameters = [first_phase_output_path, second_phase_output_path, third_phase_output_path,
                              third_phase_logs_path, samplename2biologicalcondition_path, number_of_random_pssms,
-                             third_phase_done_path, f'--error_path {error_path}', '-v' if verbose else '',
-                             f'-q {queue}']
+                             third_phase_done_path, f'--shuffles_percent {shuffles_percent}', f'--shuffles_digits {shuffles_digits}' ,f'--rank_method {rank_method}', f'--error_path {error_path}', 
+                             '-v' if verbose else '', f'-q {queue}','-m' if use_mapitope else '']
+        if rank_method == 'tfidf':
+            if tfidf_method:
+                module_parameters += ['--tfidf_method', tfidf_method]
+            if tfidf_factor:
+                module_parameters += ['--tfidf_factor', str(tfidf_factor)]
+        elif rank_method == 'shuffles':
+            if shuffles:
+                module_parameters += ['--shuffles', shuffles]
         cmd = submit_pipeline_step(f'{src_dir}/model_fitting/module_wraper.py',
                              [module_parameters],
                              logs_dir, f'{exp_name}_model_fitting',
@@ -123,6 +137,8 @@ def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondi
     parser.add_argument('--min_sequencing_quality', type=int, default=38,
                         help='Minimum average sequencing threshold allowed after filtration'
                              'for more details, see: https://en.wikipedia.org/wiki/Phred_quality_score')
+    parser.add_argument('--minimal_length_required', default=3, type=int,
+                        help='Shorter peptides will be discarded')                             
     parser.add_argument('--gz', action='store_true', help='gzip fastq, filtration_log, fna, and faa files')
 
     # optional parameters for the motif inference
@@ -138,9 +154,20 @@ def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondi
                         help='Maximal gap frequency allowed in msa (higher frequency columns are removed)',
                         type=lambda x: float(x) if 0 < float(x) < 1
                                                 else parser.error(f'The threshold of the maximal gap frequency allowed per column should be between 0 to 1'))
+    parser.add_argument('--concurrent_cutoffs', action='store_true',
+                        help='Use new method which splits meme before cutoffs and runs cutoffs concurrently')
+    parser.add_argument('--meme_split_size', type=int, default=1, # TODO default of 1, 5 or 10?
+                        help='Split size, how many meme per files for calculations')
+    parser.add_argument('-m', '--mapitope', action='store_true', help='use mapitope encoding')
 
     # optional parameters for the modelling step
     parser.add_argument('--number_of_random_pssms', default=100, type=int, help='Number of pssm permutations')
+    parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles'], default='pval', help='Motifs ranking method')
+    parser.add_argument('--tfidf_method', choices=['boolean', 'terms', 'log', 'augmented'], default='boolean', help='TF-IDF method')
+    parser.add_argument('--tfidf_factor', type=float, default=0.5, help='TF-IDF augmented method factor (0-1)')
+    parser.add_argument('--shuffles', default=5, type=int, help='Number of controlled shuffles permutations')
+    parser.add_argument('--shuffles_percent', default=0.2, type=float, help='Percent from shuffle with greatest number of hits (0-1)')
+    parser.add_argument('--shuffles_digits', default=2, type=int, help='Number of digits after the point to print in scanning files.')
 
     # general optional parameters
     parser.add_argument('--run_summary_path', type=str,
@@ -161,11 +188,13 @@ def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondi
     run_summary_path = args.error_path if args.error_path else os.path.join(args.analysis_dir, 'run_summary_path.txt')
     error_path = args.error_path if args.error_path else os.path.join(args.logs_dir, 'error.txt')
 
+    concurrent_cutoffs = True if args.concurrent_cutoffs else False
+
     run_pipeline(args.fastq_path, args.barcode2samplename_path, args.samplename2biologicalcondition_path,
                  args.analysis_dir.rstrip('/'), args.logs_dir.rstrip('/'),
-                 args.left_construct, args.right_construct, args.max_mismatches_allowed, args.min_sequencing_quality, True if args.gz else False,
+                 args.left_construct, args.right_construct, args.max_mismatches_allowed, args.min_sequencing_quality, args.minimal_length_required, True if args.gz else False,
                  args.max_msas_per_sample, args.max_msas_per_bc,
                  args.max_number_of_cluster_members_per_sample, args.max_number_of_cluster_members_per_bc,
-                 args.allowed_gap_frequency, args.number_of_random_pssms,
+                 args.allowed_gap_frequency, concurrent_cutoffs, args.meme_split_size, args.mapitope, args.number_of_random_pssms,
+                 args.rank_method, args.tfidf_method, args.tfidf_factor, args.shuffles, args.shuffles_percent, args.shuffles_digits,
                  run_summary_path, error_path, args.queue, True if args.verbose else False, sys.argv)
-
diff --git a/PSSM_score_Peptide/PSSM.cpp b/PSSM_score_Peptide/PSSM.cpp
@@ -2,6 +2,7 @@
 #include "SEQ.h"
 
 #include <algorithm>
+#include <random>
 using namespace std;
 
 void PSSM::setMatrix(const vector<string> & PSSMLines)
@@ -179,8 +180,8 @@ double PSSM::computeScoreExacrPos(size_t posInPSSM, const size_t charInSeq) cons
 }// end of function
 */
 
-PSSM PSSM::randomize() {
+PSSM PSSM::randomize(default_random_engine &gen) {
 	PSSM res(*this);
-	random_shuffle(res.PSSMmatrix.begin(), res.PSSMmatrix.end());
+	shuffle(res.PSSMmatrix.begin(), res.PSSMmatrix.end(), gen);
 	return res;
 }
diff --git a/PSSM_score_Peptide/PSSM.h b/PSSM_score_Peptide/PSSM.h
@@ -9,6 +9,7 @@
 #include <algorithm>    // std::min_element, std::max_element
 #include <iostream>
 #include <cmath>  // because of unix
+#include <random>
 
 using namespace std;
 #include "alphabet.h"
@@ -43,7 +44,7 @@ class PSSM {
 		return PSSMmatrix.size();
 
 	}
-	PSSM randomize();
+	PSSM randomize(default_random_engine &gen);
 	//int integerOfChar(const char s) const;
 
 	//members:

diff --git a/PSSM_score_Peptide/computePSSM_cutoffs.cpp b/PSSM_score_Peptide/computePSSM_cutoffs.cpp
@@ -9,7 +9,13 @@ void PSSM_scoresFromSeqVector(const PSSM& PSSM1, const vector<vector <size_t>> &
 computePSSM_cutoffs::computePSSM_cutoffs(vector<PSSM> & PSSM_array,
 	size_t TotalNumberOfRandoSeq,
 	alphabet& alph,
-	const string & CutofsPerPSSM_FileName) : _PSSM_array(PSSM_array), _totalNumberOfRandoSeq(TotalNumberOfRandoSeq), _alph(alph), _CutofsPerPSSM_FileName(CutofsPerPSSM_FileName)
+	const string & CutofsPerPSSM_FileName,
+	int totalMemes) : 
+	_PSSM_array(PSSM_array), 
+	_totalNumberOfRandoSeq(TotalNumberOfRandoSeq), 
+	_alph(alph), 
+	_CutofsPerPSSM_FileName(CutofsPerPSSM_FileName),
+	_totalMemes(totalMemes)
 {
 	generateRandomPeptides();
 	computecCutoffsBasedOnRandomPeptides();
@@ -23,6 +29,7 @@ string SizetToString(size_t sz) {
 
 void computePSSM_cutoffs::generateRandomPeptides() {
 	size_t NumberOfRandoSeq = _totalNumberOfRandoSeq;  
+	srand(931); // Set srand for generating random peptides // TODO set srand from input argument
 	map<string, randomPeptides>::iterator it = _randomPeptideDataSet.begin(); // use iteration and insert to add values to map
 	for (size_t length = 5; length <= 12; length++)
 	{
@@ -73,7 +80,7 @@ void computePSSM_cutoffs::computecCutoffsBasedOnRandomPeptides() {
 	ofstream PSSM_Scores_Cutoff;
 	PSSM_Scores_Cutoff.open(_CutofsPerPSSM_FileName);
 
-	double PercentOfAcceptedPeptidesPerType = PercentOfRandomHitsPerPSSM / _PSSM_array.size(); // for each seq type the cutoff will be the percent of hits accepted for all PSSMs devided by the number of PSSMs
+	double PercentOfAcceptedPeptidesPerType = PercentOfRandomHitsPerPSSM / _totalMemes; // for each seq type the cutoff will be the percent of hits accepted for all PSSMs devided by the number of PSSMs
 	for (size_t i = 0; i<_PSSM_array.size(); ++i)
 	{
 		PSSM_Scores_Cutoff << "###\t" << _PSSM_array[i].PSSM_name << "\t";

diff --git a/PSSM_score_Peptide/computePSSM_cutoffs.h b/PSSM_score_Peptide/computePSSM_cutoffs.h
@@ -14,10 +14,11 @@ using namespace std;
 
 class computePSSM_cutoffs{
 public:
-	computePSSM_cutoffs(	vector<PSSM> & PSSM_array,
-									size_t TotalNumberOfRandoSeq,
-									alphabet & alph,
-									const string & CutofsPerPSSM_FileName);
+	computePSSM_cutoffs(vector<PSSM> & PSSM_array,
+						size_t TotalNumberOfRandoSeq,
+						alphabet & alph,
+						const string & CutofsPerPSSM_FileName,
+						int totalMemes);
 
 private:
 	void generateRandomPeptides();
@@ -32,6 +33,7 @@ class computePSSM_cutoffs{
 	string const & _CutofsPerPSSM_FileName;
 	const double PercentOfRandomHitsPerPSSM = 0.05;
 	alphabet& _alph;
+	int _totalMemes;
 
 
 

diff --git a/PSSM_score_Peptide/main.cpp b/PSSM_score_Peptide/main.cpp
@@ -6,6 +6,7 @@
 #include <limits>
 #include <cfloat> // eli because of unix
 #include <cmath>  // because of unix
+#include <random>
 using namespace std;
 
 #include "PSSM.h"
@@ -67,7 +68,7 @@ size_t get_running_mode(int argc, char *argv[]){
 	int min_required_params = 2;
 	if ((argc > (max_required_params * 2) + 2) || (argc < (min_required_params * 2) + 2)) {// each with its flag and mode flag, check the value of argc. If not enough parameters have been passed, inform user and exit.
 		cout << "Usage is in one of few modes: <<endl";
-		cout << "[1: CalcPSSM_Cutoff] " << argv[0] << " -pssm <PSSMs_in_MAST_Format> -pssm_cutoffs <filename_for PSSM_cutoffs> -CalcPSSM_Cutoff" << endl; // Inform the user of how to use the program
+		cout << "[1: CalcPSSM_Cutoff] " << argv[0] << " -pssm <PSSMs_in_MAST_Format> -pssm_cutoffs <filename_for PSSM_cutoffs> -CalcPSSM_Cutoff -total_memes <total memes, used if input is splitted otherwise 0>" << endl; // Inform the user of how to use the program
 		cout << "[2: CalcPSSM_Pval] "<<argv[0]<<" -pssm <PSSMs_in_MAST_Format> -pssm_cutoffs <filename_for PSSM_cutoffs> -seq <input_seq_FASTA> -out <out> -NrandPSSM <number_of_random_PSSMs> -CalcPSSM_Pval" << endl; // Inform the user of how to use the program
 		cout << "[3: CalcPSSM_Hits] " << argv[0] << " -pssm <PSSMs_in_MAST_Format> -pssm_cutoffs <filename_for PSSM_cutoffs> -seq <input_seq_FASTA> -out <out> -CalcPSSM_Hits " << endl; // Inform the user of how to use the program
 		cout << "\n\nThe number of provided arguments is "<<argc<<endl;
@@ -95,11 +96,11 @@ size_t get_running_mode(int argc, char *argv[]){
 	return mode;
 }
 
-void getFileNamesFromArgv(int argc, char *argv[], string & PSSM_FileName, string & CutofsPerPSSM_FileName) {
+void getFileNamesFromArgv(int argc, char *argv[], string & PSSM_FileName, string & CutofsPerPSSM_FileName, int & totalMemes) {
 	// parse ARGV arguments
-	size_t num_required_params = 2;
+	size_t num_required_params = 3;
 	if (argc != (num_required_params * 2)+2) {// each with its flag and mode_flag, check the value of argc. If not enough parameters have been passed, inform user and exit.
-		cout << "Usage is -pssm <PSSMs_in_MAST_Format> -pssm_cutoffs <filename_for PSSM_cutoffs>\n"; // Inform the user of how to use the program
+		cout << "Usage is -pssm <PSSMs_in_MAST_Format> -pssm_cutoffs <filename_for PSSM_cutoffs> -total_memes <total memes, used if input is splitted otherwise 0>\n"; // Inform the user of how to use the program
 		exit(17);
 	}
 	cout << argv[0] <<" ";
@@ -110,6 +111,7 @@ void getFileNamesFromArgv(int argc, char *argv[], string & PSSM_FileName, string
 		* name of the program, which is stored in argv[0] */
 		if (string(argv[i]) == "-pssm") PSSM_FileName = string(argv[i + 1]);
 		else if (string(argv[i]) == "-pssm_cutoffs") CutofsPerPSSM_FileName = string(argv[i + 1]);
+		else if (string(argv[i]) == "-total_memes") totalMemes = stoi(string(argv[i + 1]));
 	}
 }
 
@@ -165,9 +167,12 @@ int computeCutoffsOfPssmMain(int argc, char *argv[])
 	size_t TotalNumberOfRandoSeq=100000;
 	string PSSM_FileName = "";
 	string CutofsPerPSSM_FileName = "";
-	getFileNamesFromArgv(argc,argv,PSSM_FileName, CutofsPerPSSM_FileName);
+	int totalMemes = 0;
+	getFileNamesFromArgv(argc,argv,PSSM_FileName, CutofsPerPSSM_FileName, totalMemes);
 	readPSSM_info_from_file rpif(PSSM_FileName);
-	computePSSM_cutoffs cpc1(rpif._PSSM_array, TotalNumberOfRandoSeq, rpif._alph, CutofsPerPSSM_FileName);
+	if (totalMemes == 0) 
+		totalMemes = rpif._PSSM_array.size();
+	computePSSM_cutoffs cpc1(rpif._PSSM_array, TotalNumberOfRandoSeq, rpif._alph, CutofsPerPSSM_FileName, totalMemes);
 	return 0;
 }
 
@@ -461,8 +466,9 @@ int assignPvalueToPSSMaRRAY(int argc, char *argv[])
 	//for (size_t i = 0; i < 1; ++i) {
 		double numberOfHitsInRealPSSM = numberOfTotalHitsPerPSSM(rpif._PSSM_array[i], Seq_array,1);
 		vector<double> numSigPeptides;
+		default_random_engine gen(483); // TODO seed should be from input
 		for (size_t j = 0; j < numberOfRandomPSSM; ++j) {
-			PSSM randomPSSM = rpif._PSSM_array[i].randomize(); //1 generate a random PPSM.
+			PSSM randomPSSM = rpif._PSSM_array[i].randomize(gen); //1 generate a random PPSM.
 			double sum = numberOfTotalHitsPerPSSM(randomPSSM, Seq_array,0);
 			numSigPeptides.push_back(sum);// store the number
 		}

diff --git a/PSSM_score_Peptide/randomPeptides.cpp b/PSSM_score_Peptide/randomPeptides.cpp
@@ -10,13 +10,16 @@
 using namespace std;
 
 #include "randomPeptides.h"
-randomPeptides::randomPeptides(vector<double> &characterFrequencies, size_t numberOfSeqToSimulate, size_t sequenceLength) : _characterFrequencies(characterFrequencies), _numberOfSeqToSimulate(numberOfSeqToSimulate), _sequenceLength(sequenceLength),_CysLoop (0) { // deafualt constroctor - NoCysLoop
-     srand(static_cast<unsigned int>(time(NULL)));
+randomPeptides::randomPeptides(vector<double> &characterFrequencies, size_t numberOfSeqToSimulate, size_t sequenceLength) : 
+	_characterFrequencies(characterFrequencies), _numberOfSeqToSimulate(numberOfSeqToSimulate), _sequenceLength(sequenceLength),_CysLoop (0) { // default constructor - NoCysLoop
+     // srand(static_cast<unsigned int>(time(NULL))); // srand is set using seed at a higher level
 }
 
-randomPeptides::randomPeptides(vector<double> &characterFrequencies, size_t numberOfSeqToSimulate, size_t sequenceLength,bool CysLoop) : _characterFrequencies(characterFrequencies), _numberOfSeqToSimulate(numberOfSeqToSimulate), _sequenceLength(sequenceLength),_CysLoop (CysLoop) {
-     srand(static_cast<unsigned int>(time(NULL)));
+randomPeptides::randomPeptides(vector<double> &characterFrequencies, size_t numberOfSeqToSimulate, size_t sequenceLength,bool CysLoop) :
+	_characterFrequencies(characterFrequencies), _numberOfSeqToSimulate(numberOfSeqToSimulate), _sequenceLength(sequenceLength),_CysLoop (CysLoop) {
+     // srand(static_cast<unsigned int>(time(NULL))); // srand is set using seed at a higher level
 }	
+
 void randomPeptides::generateRandomSequences() {
 //	std::mt19937 Seq_eng;  // a core engine class
 //	std::random_device dev_random;
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		FROM webiks/igome-profile:latest

		ENTRYPOINT ["./worker_entrypoint.sh"]