From 0ca6c410fee152da0bccacdac1e2e9dba80d1e64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Tue, 13 Jan 2015 09:40:22 -0500 Subject: [PATCH 01/23] Surveyor: add filter options to build similarity and distance matrices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Mock/Parameters.cpp | 10 +++ code/Surveyor/CoalescenceManager.cpp | 6 +- code/Surveyor/CoalescenceManager.h | 4 +- code/Surveyor/GenomeAssemblyReader.cpp | 2 +- code/Surveyor/GenomeGraphReader.cpp | 2 +- code/Surveyor/Mother.cpp | 29 +++++-- code/Surveyor/StoreKeeper.cpp | 100 +++++++++++++++++++------ code/Surveyor/StoreKeeper.h | 2 + 8 files changed, 119 insertions(+), 36 deletions(-) diff --git a/code/Mock/Parameters.cpp b/code/Mock/Parameters.cpp index b970b629..0fecbc02 100644 --- a/code/Mock/Parameters.cpp +++ b/code/Mock/Parameters.cpp @@ -1617,6 +1617,16 @@ void Parameters::showUsage(){ showOption("-read-sample-assembly SampleName SampleAssemblyFile", "Reads an assembly (a fasta file)"); cout< fastTable; fastTable["-read-sample-graph"] = INPUT_TYPE_GRAPH; + fastTable["-filter-in-graph"] = INPUT_FILTERIN_GRAPH; + fastTable["-filter-out-graph"] = INPUT_FILTEROUT_GRAPH; fastTable["-read-sample-assembly"] = INPUT_TYPE_ASSEMBLY; + fastTable["-filter-in-assembly"] = INPUT_FILTERIN_ASSEMBLY; + fastTable["-filter-out-assembly"] = INPUT_FILTEROUT_ASSEMBLY; // Unsupported option if(fastTable.count(element) == 0 || i+2 > (int) commands->size()) @@ -495,13 +503,23 @@ void Mother::startSurveyor() { send(m_coalescenceManager, dummyMessage); + // send kmerLength and sampleInputTypes to localStore int kmerLength = m_parameters->getWordSize(); + vector * sampleInputTypes = & m_sampleInputTypes; + + char buffer[32]; + int offset = 0; + memcpy(buffer + offset, &kmerLength, sizeof(kmerLength)); + offset += sizeof(kmerLength); + memcpy(buffer + offset, &sampleInputTypes, sizeof(sampleInputTypes)); + offset += sizeof(sampleInputTypes); - // send the kmer to local store Message kmerMessage; kmerMessage.setBuffer(&kmerLength); kmerMessage.setNumberOfBytes(sizeof(kmerLength)); - kmerMessage.setTag(CoalescenceManager::SET_KMER_LENGTH); + kmerMessage.setBuffer(&buffer); + kmerMessage.setNumberOfBytes(sizeof(buffer)); + kmerMessage.setTag(CoalescenceManager::SET_KMER_INFO); send(localStore, kmerMessage); } @@ -535,13 +553,12 @@ void Mother::spawnReader() { int type = m_sampleInputTypes[sampleIdentifier]; m_fileIterator++; - if(type == INPUT_TYPE_GRAPH){ + if(type < 3){ GenomeGraphReader * actor = new GenomeGraphReader(); spawn(actor); actor->setFileName(fileName, sampleIdentifier); - int coalescenceManagerName = m_coalescenceManager; int destination = actor->getName(); Message dummyMessage; @@ -555,7 +572,7 @@ void Mother::spawnReader() { send(destination, dummyMessage); - } else if(type == INPUT_TYPE_ASSEMBLY) { + } else if(type > 2) { GenomeAssemblyReader * actor = new GenomeAssemblyReader(); spawn(actor); diff --git a/code/Surveyor/StoreKeeper.cpp b/code/Surveyor/StoreKeeper.cpp index 8e893fa5..cd6ca69f 100644 --- a/code/Surveyor/StoreKeeper.cpp +++ b/code/Surveyor/StoreKeeper.cpp @@ -91,11 +91,12 @@ void StoreKeeper::receive(Message & message) { } else if(tag == MERGE_GRAM_MATRIX) { - - // printName(); - // cout << "DEBUG at MERGE_GRAM_MATRIX message reception "; - // cout << "(StoreKeeper) received " << m_receivedObjects << " objects in total"; - // cout << " with " << m_receivedPushes << " push operations" << endl; +#if 0 + printName(); + cout << "DEBUG at MERGE_GRAM_MATRIX message reception "; + cout << "(StoreKeeper) received " << m_receivedObjects << " objects in total"; + cout << " with " << m_receivedPushes << " push operations" << endl; +#endif computeLocalGramMatrix(); @@ -103,11 +104,10 @@ void StoreKeeper::receive(Message & message) { memcpy(&m_matrixOwner, buffer, sizeof(m_matrixOwner)); - /* +/* printName(); cout << "DEBUG m_matrixOwner " << m_matrixOwner << endl; */ - m_iterator1 = m_localGramMatrix.begin(); if(m_iterator1 != m_localGramMatrix.end()) { @@ -132,13 +132,17 @@ void StoreKeeper::receive(Message & message) { } else if(tag == KmerMatrixOwner::PUSH_KMER_SAMPLES_OK) { sendKmersSamples(); - } else if(tag == CoalescenceManager::SET_KMER_LENGTH) { + } else if(tag == CoalescenceManager::SET_KMER_INFO) { int kmerLength = 0; int position = 0; + char * buffer = (char*)message.getBufferBytes(); memcpy(&kmerLength, buffer + position, sizeof(kmerLength)); position += sizeof(kmerLength); + memcpy(&m_sampleInputTypes, buffer + position, sizeof(m_sampleInputTypes)); + position += sizeof(m_sampleInputTypes); + if(m_kmerLength == 0) m_kmerLength = kmerLength; @@ -152,9 +156,18 @@ void StoreKeeper::receive(Message & message) { m_colorSpaceMode = false; #if 0 - cout << "DEBUG StoreKeeper SET_KMER_LENGTH "; - cout << m_kmerLength; + std::ostringstream ss; + for(std::vector::iterator it = m_sampleInputTypes->begin(); it != m_sampleInputTypes->end(); ++it) { + ss << *it; + } + + cout << "DEBUG StoreKeeper SET_KMER_INFO "; + cout << m_kmerLength << endl; + + cout << "DEBUG StoreKeeper list of filters :"; + cout << ss.str(); cout << endl; + #endif } @@ -281,10 +294,7 @@ void StoreKeeper::computeLocalGramMatrix() { // This directed survey only aims at counting colored kmers with colors // other than sample colors - bool useFirstColorToFilter = false; - - int filterColor = 0; - bool hasFilter = samples->count(filterColor) > 0; + bool filterOutKmer = checkKmerFilter(samples); // since people are going to use this to check // for genome size, don't duplicate counts @@ -342,20 +352,14 @@ void StoreKeeper::computeLocalGramMatrix() { //if(sample2 < sample1) // this is a diagonal matrix - if(useFirstColorToFilter && !hasFilter) { + if(filterOutKmer) continue; - } m_localGramMatrix[sample1Index][sample2Index] += hits; - /* - cout << "DEBUG count entry "; - cout << "[ " << sample1Index << " "; - cout << sample2Index << " "; - cout << - */ - +#if 0 sum += hits; +#endif //m_localGramMatrix[sample2Index][sample1Index] += hits; } } @@ -674,3 +678,53 @@ void StoreKeeper::sendKmersSamples() { send(m_kmerMatrixOwner, message); } + + + +bool StoreKeeper::checkKmerFilter (set * samples) { + +// #define INPUT_TYPE_GRAPH 0 +// #define INPUT_FILTERIN_GRAPH 1 +// #define INPUT_FILTEROUT_GRAPH 2 +// #define INPUT_TYPE_ASSEMBLY 3 +// #define INPUT_FILTERIN_ASSEMBLY 4 +// #define INPUT_FILTEROUT_ASSEMBLY 5 + + bool skipKmer = false; + vector samplesFILTERIN; + + for(std::vector::iterator it = m_sampleInputTypes->begin(); it != m_sampleInputTypes->end(); ++it) { + + int sample = distance(m_sampleInputTypes->begin(),it); + + if(*it == 0 || *it == 3){ + continue; + } + else if(*it == 1 || *it == 4){ + // Store the FILTERIN for later lookup + samplesFILTERIN.push_back(sample); + } + else if(*it == 2 || *it == 5){ + // Skip the Kmer if part of a FILTEROUT + if (samples->count(sample) > 0){ + return true; + } + } + + } + + // Look if the Kmer is not part of a FILTERIN + // Only incorporates kmers from FILTERIN samples if not already Filtered OUT + if(!samplesFILTERIN.empty()) { + skipKmer = true; + for(std::vector::iterator it = samplesFILTERIN.begin(); it != samplesFILTERIN.end(); ++it) { + if (samples->count(*it) > 0){ + return false; + } + } + } + + + return skipKmer; + +} diff --git a/code/Surveyor/StoreKeeper.h b/code/Surveyor/StoreKeeper.h index fcedd69a..1381704e 100644 --- a/code/Surveyor/StoreKeeper.h +++ b/code/Surveyor/StoreKeeper.h @@ -60,6 +60,7 @@ class StoreKeeper: public Actor { int m_mother; int m_matrixOwner; int m_kmerMatrixOwner; + vector * m_sampleInputTypes; bool m_configured; @@ -92,6 +93,7 @@ class StoreKeeper: public Actor { void sendMatrixCell(); + bool checkKmerFilter(set * samples); public: StoreKeeper(); From 324c512100e5063254cb70df4eb87bd2b489ed58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Thu, 15 Jan 2015 11:53:28 -0500 Subject: [PATCH 02/23] Surveyor: change type verification with defined variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/Mother.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/Surveyor/Mother.cpp b/code/Surveyor/Mother.cpp index 6ba25e8d..19b43f2c 100644 --- a/code/Surveyor/Mother.cpp +++ b/code/Surveyor/Mother.cpp @@ -553,7 +553,7 @@ void Mother::spawnReader() { int type = m_sampleInputTypes[sampleIdentifier]; m_fileIterator++; - if(type < 3){ + if(type == INPUT_TYPE_GRAPH || type == INPUT_FILTERIN_GRAPH || type == INPUT_FILTEROUT_GRAPH) { GenomeGraphReader * actor = new GenomeGraphReader(); spawn(actor); @@ -572,7 +572,7 @@ void Mother::spawnReader() { send(destination, dummyMessage); - } else if(type > 2) { + } else if(type == INPUT_TYPE_ASSEMBLY || type == INPUT_FILTERIN_ASSEMBLY || type == INPUT_FILTEROUT_ASSEMBLY){ GenomeAssemblyReader * actor = new GenomeAssemblyReader(); spawn(actor); From 6540f7d3d38cd13d8f43fe6c1550c2d3da741b6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Wed, 11 Mar 2015 15:09:47 -0400 Subject: [PATCH 03/23] fix unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/StoreKeeper.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/code/Surveyor/StoreKeeper.cpp b/code/Surveyor/StoreKeeper.cpp index cd6ca69f..5f55b69d 100644 --- a/code/Surveyor/StoreKeeper.cpp +++ b/code/Surveyor/StoreKeeper.cpp @@ -260,7 +260,9 @@ void StoreKeeper::printColorReport() { void StoreKeeper::computeLocalGramMatrix() { +#if 0 uint64_t sum = 0; +#endif // compute the local Gram matrix From 7bd03fb1baa734bd1383f3a05acaadc10b2bc146 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Tue, 17 Mar 2015 17:40:41 -0400 Subject: [PATCH 04/23] Surveyor: storekeeper now use define values instead of int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/StoreKeeper.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/code/Surveyor/StoreKeeper.cpp b/code/Surveyor/StoreKeeper.cpp index 5f55b69d..5b72311e 100644 --- a/code/Surveyor/StoreKeeper.cpp +++ b/code/Surveyor/StoreKeeper.cpp @@ -38,6 +38,15 @@ using namespace std; #include + +#define INPUT_TYPE_GRAPH 0 +#define INPUT_FILTERIN_GRAPH 1 +#define INPUT_FILTEROUT_GRAPH 2 +#define INPUT_TYPE_ASSEMBLY 3 +#define INPUT_FILTERIN_ASSEMBLY 4 +#define INPUT_FILTEROUT_ASSEMBLY 5 + + StoreKeeper::StoreKeeper() { m_storeDataCalls = 0; @@ -685,12 +694,6 @@ void StoreKeeper::sendKmersSamples() { bool StoreKeeper::checkKmerFilter (set * samples) { -// #define INPUT_TYPE_GRAPH 0 -// #define INPUT_FILTERIN_GRAPH 1 -// #define INPUT_FILTEROUT_GRAPH 2 -// #define INPUT_TYPE_ASSEMBLY 3 -// #define INPUT_FILTERIN_ASSEMBLY 4 -// #define INPUT_FILTEROUT_ASSEMBLY 5 bool skipKmer = false; vector samplesFILTERIN; @@ -699,14 +702,14 @@ bool StoreKeeper::checkKmerFilter (set * samples) { int sample = distance(m_sampleInputTypes->begin(),it); - if(*it == 0 || *it == 3){ + if(*it == INPUT_TYPE_GRAPH || *it == INPUT_TYPE_ASSEMBLY){ continue; } - else if(*it == 1 || *it == 4){ + else if(*it == INPUT_FILTERIN_GRAPH || *it == INPUT_FILTERIN_ASSEMBLY){ // Store the FILTERIN for later lookup samplesFILTERIN.push_back(sample); } - else if(*it == 2 || *it == 5){ + else if(*it == INPUT_FILTEROUT_GRAPH || *it == INPUT_FILTEROUT_ASSEMBLY){ // Skip the Kmer if part of a FILTEROUT if (samples->count(sample) > 0){ return true; From 28b0d473027745b48f3f93f5558518df7d876594 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Tue, 7 Apr 2015 15:59:53 -0400 Subject: [PATCH 05/23] edit StoreKeeper.cpp physical from virtual colors loop now in N*LogN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/StoreKeeper.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/code/Surveyor/StoreKeeper.cpp b/code/Surveyor/StoreKeeper.cpp index 5b72311e..c583fd33 100644 --- a/code/Surveyor/StoreKeeper.cpp +++ b/code/Surveyor/StoreKeeper.cpp @@ -347,14 +347,15 @@ void StoreKeeper::computeLocalGramMatrix() { // TODO: samples could be ordered and stored in a vector // to stop as soon as j > i - // Complexity: quadratic in the number of samples. + // Complexity: quadratic in the number of samples -> physical colors of the virtual color. + // .. Now N*LogN in the number of samples VS quadratic for(set::iterator sample1 = samples->begin(); sample1 != samples->end(); ++sample1) { SampleIdentifier sample1Index = *sample1; - for(set::iterator sample2 = samples->begin(); + for(set::iterator sample2 = sample1; sample2 != samples->end(); ++sample2) { @@ -368,6 +369,9 @@ void StoreKeeper::computeLocalGramMatrix() { m_localGramMatrix[sample1Index][sample2Index] += hits; + if (sample1Index != sample2Index) + m_localGramMatrix[sample2Index][sample1Index] += hits; + #if 0 sum += hits; #endif From 62131e34f3592843cce5d0002c3028c747c807cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Mon, 13 Apr 2015 17:48:07 -0400 Subject: [PATCH 06/23] edit comments in Mother MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/Mother.cpp | 123 +++++++-------------------------------- code/Surveyor/Mother.h | 22 ++++--- 2 files changed, 30 insertions(+), 115 deletions(-) diff --git a/code/Surveyor/Mother.cpp b/code/Surveyor/Mother.cpp index 19b43f2c..c03bcf9b 100644 --- a/code/Surveyor/Mother.cpp +++ b/code/Surveyor/Mother.cpp @@ -70,28 +70,19 @@ void Mother::receive(Message & message) { char * buffer = message.getBufferBytes(); if(tag == Actor::BOOT) { - boot(message); } else if (tag == Mother::HELLO) { hello(message); - } else if(tag == GenomeGraphReader::START_PARTY_OK) { // spawn the next reader now ! - - /* - printName(); - cout << "DEBUG spawnReader because START_PARTY_OK" << endl; -*/ spawnReader(); } else if(tag == GenomeGraphReader::DONE) { m_aliveReaders--; - if(m_aliveReaders == 0) { - notifyController(); } @@ -155,6 +146,8 @@ void Mother::receive(Message & message) { // is caused by the fact that this message is not // received . + // do nothing + } else if(tag == FINISH_JOB) { m_finishedMothers++; @@ -163,28 +156,14 @@ void Mother::receive(Message & message) { // all readers have finished, // now tell mother to flush aggregators - - /* - printName(); - cout << "DEBUG m_finishedMothers: " << m_finishedMothers << " "; - cout << " starting pair FLUSH_AGGREGATOR, FLUSH_AGGREGATOR_RETURN"; - cout << endl; - */ - sendToFirstMother(FLUSH_AGGREGATOR, FLUSH_AGGREGATOR_RETURN); } } else if(tag == FLUSH_AGGREGATOR) { - /* - printName(); - cout << "DEBUG received FLUSH_AGGREGATOR" << endl; - */ - m_bigMother = source; // forward the message to the aggregator - Message newMessage; newMessage.setTag(CoalescenceManager::FLUSH_BUFFERS); send(m_coalescenceManager, newMessage); @@ -195,20 +174,12 @@ void Mother::receive(Message & message) { } else if(tag == CoalescenceManager::FLUSH_BUFFERS_OK) { - /* - printName(); - cout << "DEBUG CoalescenceManager sent FLUSH_BUFFERS_OK to mother." << endl; - */ - + // notice bigMother that the CoalescenceManager flushed its buffers + // .. for this rank's mother Message response; response.setTag(FLUSH_AGGREGATOR_OK); send(m_bigMother, response); - /* - printName(); - cout << "DEBUG sending FLUSH_AGGREGATOR_OK to m_bigMother" << endl; - */ - } else if(tag == MatrixOwner::GRAM_MATRIX_IS_READY) { if(m_matricesAreReady){ @@ -232,8 +203,6 @@ void Mother::receive(Message & message) { } else if(tag == FLUSH_AGGREGATOR_OK) { - // printName(); - m_flushedMothers++; if(m_flushedMothers < getSize()) @@ -244,22 +213,19 @@ void Mother::receive(Message & message) { } else if(tag == m_responseTag) { if(m_responseTag == SHUTDOWN_OK) { - + // do nothing } else if(m_responseTag == MERGE_GRAM_MATRIX_OK) { + // All mothers merged their GRAM MATRIX // Spawn KmerMatrixOwner to print if(m_motherToKill < getSize() && m_printKmerMatrix){ spawnKmerMatrixOwner(); } - } else if(m_responseTag == MERGE_KMER_MATRIX_OK) { + } else if(m_responseTag == MERGE_KMER_MATRIX_OK) { + // do nothing } else if(m_responseTag == FLUSH_AGGREGATOR_RETURN) { - - /* - printName(); - cout << "DEBUG FLUSH_AGGREGATOR_RETURN received "; - cout << "m_motherToKill " << m_motherToKill << endl; - */ + // do nothing } // every mother was not informed. @@ -267,7 +233,6 @@ void Mother::receive(Message & message) { sendMessageWithReply(m_motherToKill, m_forwardTag); m_motherToKill--; } - } } @@ -280,13 +245,10 @@ void Mother::sendToFirstMother(int forwardTag, int responseTag) { sendMessageWithReply(m_motherToKill, m_forwardTag); m_motherToKill--; + } void Mother::sendMessageWithReply(int & actor, int tag) { -/* - printName(); - cout << "kills Mother " << actor << endl; - */ Message message; message.setTag(tag); @@ -298,25 +260,20 @@ void Mother::sendMessageWithReply(int & actor, int tag) { message.setBuffer(&m_kmerMatrixOwner); message.setNumberOfBytes(sizeof(m_kmerMatrixOwner)); } else if(tag == FLUSH_AGGREGATOR) { - - /* - printName(); - cout << " DEBUG sending message FLUSH_AGGREGATOR" << endl; - */ + // do nothing } send(actor, message); + } void Mother::notifyController() { + Message message2; message2.setTag(FINISH_JOB); // first Mother int controller = getSize(); - - printName(); - cout << "Mother notifies controller " << controller << endl; send(controller, message2); } @@ -347,55 +304,20 @@ void Mother::stop() { } void Mother::hello(Message & message) { - /* - printName(); - cout << "received HELLO from "; - cout << message.getSourceActor(); - cout << " bytes: " << message.getNumberOfBytes(); - cout << " content: " << *((int*) message.getBufferBytes()); - */ - - //char * buffer = (char*) message.getBufferBytes(); - /* - uint32_t checksum = computeCyclicRedundancyCode32((uint8_t*) message.getBufferBytes(), - message.getNumberOfBytes()); - */ - //cout << "DEBUG CRC32= " << checksum << endl; - - //cout << endl; + // dump for testing purposes } void Mother::boot(Message & message) { m_aliveReaders = 0; - /* - printName(); - cout << "Mother is booting and says hello" << endl; -*/ Message message2; - /* - char joe[4000]; - - int i = 4000; - char value = 0; - while(i) - joe[i--]=value++; -*/ int joe = 9921; message2.setBuffer(&joe); - //message2.setNumberOfBytes(4000); - message2.setNumberOfBytes( sizeof(int) * 1 ); - //cout << "DEBUG sending " << joe << endl; - - /* - uint32_t checksum = computeCyclicRedundancyCode32((uint8_t*) message2.getBufferBytes(), - message2.getNumberOfBytes() ); - cout << "DEBUG CRC32= " << checksum << endl; -*/ + message2.setNumberOfBytes( sizeof(int) * 1 ); message2.setTag(Mother::HELLO); @@ -479,8 +401,7 @@ void Mother::startSurveyor() { m_coalescenceManager = coalescenceManager->getName(); - // spawn the local store keeper and introduce the CoalescenceManager - // to the StoreKeeper + // spawn the local StoreKeeper and introduce the CoalescenceManager to him // spawn actors for storing the graph. for(int i = 0 ; i < PLAN_STORE_KEEPER_ACTORS_PER_RANK; ++i) { @@ -525,7 +446,6 @@ void Mother::startSurveyor() { // spawn an actor for each file that this actor owns - for(int i = 0; i < (int) m_inputFileNames.size() ; ++i) { int mother = getName() % getSize(); @@ -596,7 +516,6 @@ void Mother::spawnReader() { } if(m_aliveReaders == 0) { - notifyController(); } } @@ -613,9 +532,8 @@ void Mother::spawnMatrixOwner() { printName(); cout << "Spawned MatrixOwner actor !" << m_matrixOwner << endl; - // tell the StoreKeeper actors to send their stuff to the - // MatrixOwner actor - // The Mother of Mother will wait for a signal from MatrixOwner + // tell the StoreKeeper actors to send their stuff to the MatrixOwner actor + // bigMother will wait for a signal from MatrixOwner Message greetingMessage; @@ -648,9 +566,8 @@ void Mother::spawnKmerMatrixOwner() { printName(); cout << "Spawned KmerMatrixOwner actor !" << m_kmerMatrixOwner << endl; - // tell the StoreKeeper actors to send their stuff to the - // KmerMatrixOwner actor - // The Mother of Mother will wait for a signal from MatrixOwner + // tell the StoreKeeper actors to send their stuff to the KmerMatrixOwner actor + // bigMother will wait for a signal from KmerMatrixOwner Message greetingMessage; diff --git a/code/Surveyor/Mother.h b/code/Surveyor/Mother.h index f078fb0b..856adae8 100644 --- a/code/Surveyor/Mother.h +++ b/code/Surveyor/Mother.h @@ -56,12 +56,12 @@ class Mother: public Actor { private: int m_matrixOwner; - int m_kmerMatrixOwner; + int m_kmerMatrixOwner; int m_flushedMothers; int m_finishedMothers; - bool m_matricesAreReady; - bool m_printKmerMatrix; + bool m_matricesAreReady; + bool m_printKmerMatrix; Parameters * m_parameters; @@ -75,7 +75,7 @@ class Mother: public Actor { vector m_sampleNames; vector m_inputFileNames; - vector m_sampleInputTypes; + vector m_sampleInputTypes; int m_bigMother; int m_aliveReaders; @@ -84,7 +84,8 @@ class Mother: public Actor { int m_forwardTag; int m_responseTag; - void spawnReader(); + + // Mother methods void startSurveyor(); void hello(Message & message); void boot(Message & message); @@ -97,12 +98,10 @@ class Mother: public Actor { */ void sendToFirstMother(int forwardTag, int responseTag); - /* int m_kmerMatrixBlocNumber; */ - /* void printLocalKmersMatrix(string & kmer, string & samples_kmers, bool force); */ - /* void createKmersMatrixOutputFile(); */ - - void spawnMatrixOwner(); - void spawnKmerMatrixOwner(); + // Actor spawning + void spawnReader(); + void spawnMatrixOwner(); + void spawnKmerMatrixOwner(); public: @@ -131,4 +130,3 @@ class Mother: public Actor { }; #endif - From ef1d529eb2211ac192d15bd7e6674f92ad8e452e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Mon, 13 Apr 2015 20:11:08 -0400 Subject: [PATCH 07/23] edit typo in Mother.cpp comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/Mother.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/code/Surveyor/Mother.cpp b/code/Surveyor/Mother.cpp index c03bcf9b..fef835e6 100644 --- a/code/Surveyor/Mother.cpp +++ b/code/Surveyor/Mother.cpp @@ -146,7 +146,7 @@ void Mother::receive(Message & message) { // is caused by the fact that this message is not // received . - // do nothing + // does nothing } else if(tag == FINISH_JOB) { @@ -213,7 +213,7 @@ void Mother::receive(Message & message) { } else if(tag == m_responseTag) { if(m_responseTag == SHUTDOWN_OK) { - // do nothing + // does nothing } else if(m_responseTag == MERGE_GRAM_MATRIX_OK) { // All mothers merged their GRAM MATRIX @@ -223,9 +223,9 @@ void Mother::receive(Message & message) { } } else if(m_responseTag == MERGE_KMER_MATRIX_OK) { - // do nothing + // does nothing } else if(m_responseTag == FLUSH_AGGREGATOR_RETURN) { - // do nothing + // does nothing } // every mother was not informed. @@ -260,7 +260,7 @@ void Mother::sendMessageWithReply(int & actor, int tag) { message.setBuffer(&m_kmerMatrixOwner); message.setNumberOfBytes(sizeof(m_kmerMatrixOwner)); } else if(tag == FLUSH_AGGREGATOR) { - // do nothing + // does nothing } send(actor, message); From bd004c5644da6964700744607f2617f893f4c1bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Tue, 14 Apr 2015 23:07:46 -0400 Subject: [PATCH 08/23] edit output the name of the actors for debugging purpose MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/CoalescenceManager.cpp | 8 ++++---- code/Surveyor/GenomeAssemblyReader.cpp | 9 ++++----- code/Surveyor/GenomeGraphReader.cpp | 13 ++++++------- code/Surveyor/KmerMatrixOwner.cpp | 5 +++++ code/Surveyor/MatrixOwner.cpp | 6 +++--- code/Surveyor/Mother.cpp | 19 ++++++++++--------- code/Surveyor/StoreKeeper.cpp | 25 +++++++++++++------------ 7 files changed, 45 insertions(+), 40 deletions(-) diff --git a/code/Surveyor/CoalescenceManager.cpp b/code/Surveyor/CoalescenceManager.cpp index ba2c9431..89852678 100644 --- a/code/Surveyor/CoalescenceManager.cpp +++ b/code/Surveyor/CoalescenceManager.cpp @@ -97,7 +97,7 @@ void CoalescenceManager::receive(Message & message) { if(m_kmerLength != kmerLength) { printName(); - cout << " Error: the k-mer length is not the same in all input files !"; + cout << "[CoalescenceManager] ERROR: the k-mer length is not the same in all input files !"; cout << endl; } @@ -172,8 +172,8 @@ void CoalescenceManager::receive(Message & message) { m_storeLastActor = last; printName(); - cout << " is now acquainted with StoreKeeper actors from "; - cout << m_storeFirstActor << " to " << m_storeLastActor << endl; + cout << "[CoalescenceManager] is now acquainted with [StoreKeepers] from #"; + cout << m_storeFirstActor << " to #" << m_storeLastActor << endl; // allocate buffers too @@ -368,7 +368,7 @@ bool CoalescenceManager::addKmerInBuffer(int producer, int & actor, int & sample #if 0 printName(); - cout << "sends bits to StoreKeeper # " << storageDestination; + cout << "sends bits to [StoreKeeper] #" << storageDestination; cout << endl; #endif diff --git a/code/Surveyor/GenomeAssemblyReader.cpp b/code/Surveyor/GenomeAssemblyReader.cpp index c6a8da80..4e509fe1 100644 --- a/code/Surveyor/GenomeAssemblyReader.cpp +++ b/code/Surveyor/GenomeAssemblyReader.cpp @@ -82,7 +82,7 @@ void GenomeAssemblyReader::startParty(Message & message) { m_loaded = 0; printName(); - cout <<"opens file " << m_fileName << endl; + cout << " [AssemblyReader] opens file " << m_fileName << endl; m_parent = message.getSourceActor(); @@ -221,17 +221,16 @@ void GenomeAssemblyReader::manageCommunicationForNewKmer(string & sequence, Cove #if 0 printName(); - cout << "DEBUG sending PAYLOAD to " << m_aggregator; + cout << " DEBUG sending PAYLOAD to " << m_aggregator; cout << " with " << position << " bytes "; vertex.print(sequence.length(), false); cout << endl; #endif int period = 1000000; - if(m_loaded % period == 0) { + if(m_loaded % period == 0 && m_loaded > 0) { printName(); - cout << " loaded " << m_loaded << " sequences" << endl; - + cout << " [AssemblyReader] loaded " << m_loaded << " sequences" << endl; } m_loaded ++; send(m_aggregator, message); diff --git a/code/Surveyor/GenomeGraphReader.cpp b/code/Surveyor/GenomeGraphReader.cpp index ec7c6689..990e3e43 100644 --- a/code/Surveyor/GenomeGraphReader.cpp +++ b/code/Surveyor/GenomeGraphReader.cpp @@ -83,7 +83,7 @@ void GenomeGraphReader::startParty(Message & message) { m_loaded = 0; printName(); - cout <<"opens file " << m_fileName << endl; + cout << " [GraphReader] opens file " << m_fileName << endl; m_parent = message.getSourceActor(); @@ -120,11 +120,11 @@ void GenomeGraphReader::readLine() { printName(); if(m_bad) { - cout << " Error: file " << m_fileName << " does not exist"; + cout << " [GraphReader] Error: file " << m_fileName << " does not exist"; cout << endl; } else { - cout << " finished reading file " << m_fileName; + cout << " [GraphReader] finished reading file " << m_fileName; cout << " got " << m_loaded << " objects" << endl; } @@ -250,17 +250,16 @@ void GenomeGraphReader::readLine() { #if 0 printName(); - cout << "DEBUG sending PAYLOAD to " << m_aggregator; + cout << " DEBUG sending PAYLOAD to " << m_aggregator; cout << " with " << position << " bytes "; vertex.print(sequence.length(), false); cout << endl; #endif int period = 1000000; - if(m_loaded % period == 0) { + if(m_loaded % period == 0 && m_loaded > 0) { printName(); - cout << " loaded " << m_loaded << " sequences" << endl; - + cout << " [GraphReader] loaded " << m_loaded << " sequences" << endl; } m_loaded ++; send(m_aggregator, message); diff --git a/code/Surveyor/KmerMatrixOwner.cpp b/code/Surveyor/KmerMatrixOwner.cpp index fb1f8660..f7d2829c 100644 --- a/code/Surveyor/KmerMatrixOwner.cpp +++ b/code/Surveyor/KmerMatrixOwner.cpp @@ -119,6 +119,11 @@ void KmerMatrixOwner::receive(Message & message) { coolMessage.setTag(KMER_MATRIX_IS_READY); send(m_mother, coolMessage); m_kmerMatrixFile.close(); + + string kmerMatrix = m_kmerMatrix.str(); + printName(); + cout << "[KmerMatrixOwner] printed the Kmers Matrix: "; + cout << kmerMatrix << endl; } } diff --git a/code/Surveyor/MatrixOwner.cpp b/code/Surveyor/MatrixOwner.cpp index f3411bec..056e9f37 100644 --- a/code/Surveyor/MatrixOwner.cpp +++ b/code/Surveyor/MatrixOwner.cpp @@ -103,7 +103,7 @@ void MatrixOwner::receive(Message & message) { if(m_completedStoreActors == getSize()) { printName(); - cout << "MatrixOwner received " << m_receivedPayloads << " payloads" << endl; + cout << "[MatrixOwner] received " << m_receivedPayloads << " payloads" << endl; // create directory for Surveyor ostringstream matrixFile; @@ -126,7 +126,7 @@ void MatrixOwner::receive(Message & message) { similarityFile.close(); printName(); - cout << "MatrixOwner printed the Similarity Matrix: "; + cout << "[MatrixOwner] printed the Similarity Matrix: "; cout << similarityMatrix << endl; // create DistanceMatrix @@ -145,7 +145,7 @@ void MatrixOwner::receive(Message & message) { distanceFile.close(); printName(); - cout << "MatrixOwner printed the Distance Matrix: "; + cout << "[MatrixOwner] printed the Distance Matrix: "; cout << distanceMatrix << endl; diff --git a/code/Surveyor/Mother.cpp b/code/Surveyor/Mother.cpp index fef835e6..309f46cd 100644 --- a/code/Surveyor/Mother.cpp +++ b/code/Surveyor/Mother.cpp @@ -186,14 +186,14 @@ void Mother::receive(Message & message) { sendToFirstMother(SHUTDOWN, SHUTDOWN_OK); }else { printName(); - cout << "GRAM_MATRIX_IS_READY" << endl; + cout << "[BigMother] GRAM_MATRIX_IS_READY" << endl; m_matricesAreReady = true; } } else if(tag == KmerMatrixOwner::KMER_MATRIX_IS_READY) { printName(); - cout << "KMER_MATRIX_IS_READY" << endl; + cout << "[BigMother] KMER_MATRIX_IS_READY" << endl; if(m_matricesAreReady){ sendToFirstMother(SHUTDOWN, SHUTDOWN_OK); @@ -329,8 +329,7 @@ void Mother::boot(Message & message) { next += getSize(); printName(); - cout << " local Mother is " << getName() << ","; - cout << " friend is # " << next << endl; + cout << "[RankMother] friend of [RankMother] #" << next << endl; send(next, message2); @@ -392,7 +391,9 @@ void Mother::startSurveyor() { if(isRoot) { printName(); - cout << "samples= " << m_sampleNames.size() << endl; + cout << "[BigMother] I am the Survey's Goddess and I am watching you!" << endl; + printName(); + cout << "[BigMother] Total number of samples to compare: " << m_sampleNames.size() << endl; } @@ -457,7 +458,7 @@ void Mother::startSurveyor() { } printName(); - cout << " readers to spawn: " << m_filesToSpawn.size() << endl; + cout << "[RankMother] readers to spawn: " << m_filesToSpawn.size() << endl; m_fileIterator = 0; spawnReader(); @@ -530,7 +531,7 @@ void Mother::spawnMatrixOwner() { m_matrixOwner = matrixOwner->getName(); printName(); - cout << "Spawned MatrixOwner actor !" << m_matrixOwner << endl; + cout << "[BigMother] Spawned [MatrixOwner] actor #" << m_matrixOwner << endl; // tell the StoreKeeper actors to send their stuff to the MatrixOwner actor // bigMother will wait for a signal from MatrixOwner @@ -557,14 +558,14 @@ void Mother::spawnMatrixOwner() { void Mother::spawnKmerMatrixOwner() { - // spawn the MatrixOwner here ! + // spawn the KmerMatrixOwner here ! KmerMatrixOwner * kmerMatrixOwner = new KmerMatrixOwner(); spawn(kmerMatrixOwner); m_kmerMatrixOwner = kmerMatrixOwner->getName(); printName(); - cout << "Spawned KmerMatrixOwner actor !" << m_kmerMatrixOwner << endl; + cout << "[BigMother] Spawned [KmerMatrixOwner] actor #" << m_kmerMatrixOwner << endl; // tell the StoreKeeper actors to send their stuff to the KmerMatrixOwner actor // bigMother will wait for a signal from KmerMatrixOwner diff --git a/code/Surveyor/StoreKeeper.cpp b/code/Surveyor/StoreKeeper.cpp index c583fd33..9ead8e0d 100644 --- a/code/Surveyor/StoreKeeper.cpp +++ b/code/Surveyor/StoreKeeper.cpp @@ -82,7 +82,7 @@ void StoreKeeper::receive(Message & message) { } else if( tag == CoalescenceManager::DIE) { printName(); - cout << "(StoreKeeper) received " << m_receivedObjects << " objects in total"; + cout << "[StoreKeeper] received " << m_receivedObjects << " objects in total"; cout << " with " << m_receivedPushes << " push operations" << endl; @@ -90,11 +90,11 @@ void StoreKeeper::receive(Message & message) { uint64_t size = m_hashTable.size() * 2; printName(); - cout << "has " << size << " Kmer objects in MyHashTable instance (final)" << endl; + cout << "[StoreKeeper] has " << size << " Kmer objects in MyHashTable instance (final)" << endl; printName(); - cout << "will now die (StoreKeeper)" << endl; + cout << "[StoreKeeper] will now die " << endl; die(); @@ -103,7 +103,7 @@ void StoreKeeper::receive(Message & message) { #if 0 printName(); cout << "DEBUG at MERGE_GRAM_MATRIX message reception "; - cout << "(StoreKeeper) received " << m_receivedObjects << " objects in total"; + cout << "[StoreKeeper] received " << m_receivedObjects << " objects in total"; cout << " with " << m_receivedPushes << " push operations" << endl; #endif computeLocalGramMatrix(); @@ -158,7 +158,7 @@ void StoreKeeper::receive(Message & message) { if(kmerLength != m_kmerLength) { - cout << "Error: the k-mer value is different this time !" << endl; + cout << "[StoreKeeper] ERROR: the k-mer value is different this time !" << endl; } // the color space mode is an artefact. @@ -231,6 +231,7 @@ void StoreKeeper::sendMatrixCell() { m_localGramMatrix.clear(); printName(); + cout << "[StoreKeeper] local Gram Matrix clearance!" << endl; Message response; response.setTag(MatrixOwner::PUSH_PAYLOAD_END); @@ -262,7 +263,7 @@ void StoreKeeper::configureHashTable() { void StoreKeeper::printColorReport() { printName(); - cout << "Coloring Report:" << endl; + cout << "Coloring Report: " << endl; m_colorSet.printColors(&cout); } @@ -382,10 +383,10 @@ void StoreKeeper::computeLocalGramMatrix() { #if 0 printName(); - cout << "DEBUG checksum " << sum << endl; + cout << " DEBUG checksum " << sum << endl; uint64_t size = m_hashTable.size(); - cout << "DEBUG m_hashTable.size() " << size << endl; + cout << " DEBUG m_hashTable.size() " << size << endl; #endif } @@ -394,7 +395,7 @@ void StoreKeeper::computeLocalGramMatrix() { void StoreKeeper::printLocalGramMatrix() { printName(); - cout << "Local Gram Matrix: " << endl; + cout << "[StoreKeeper] Local Gram Matrix:" << endl; cout << endl; for(map >::iterator column = m_localGramMatrix.begin(); @@ -490,7 +491,7 @@ void StoreKeeper::pushSampleVertex(Message & message) { void StoreKeeper::printStatus() { printName(); - cout << "(StoreKeeper) received " << m_receivedObjects << " objects so far !" << endl; + cout << "[StoreKeeper] received " << m_receivedObjects << " objects so far !" << endl; } void StoreKeeper::storeData(Vertex & vertex, int & sample) { @@ -528,7 +529,7 @@ void StoreKeeper::storeData(Vertex & vertex, int & sample) { if(size % period == 0 && size != m_lastSize) { printName(); - cout << "has " << size << " Kmer objects in MyHashTable instance" << endl; + cout << " has " << size << " Kmer objects in MyHashTable instance" << endl; m_lastSize = size; } @@ -571,7 +572,7 @@ void StoreKeeper::storeData(Vertex & vertex, int & sample) { #ifdef CONFIG_ASSERT2 if(oldVirtualColor == newVirtualColor) { - cout << "new sampleColor " << sampleColor << endl; + cout << " new sampleColor " << sampleColor << endl; //cout << "References " << m_colorSet.getNumberOfReferences(newVirtualColor); cout << endl; From 1cacd6ce87a02c4f45a94fa65659e5e8099772b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Wed, 15 Apr 2015 10:44:29 -0400 Subject: [PATCH 09/23] edit refactored the Reades comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/GenomeAssemblyReader.cpp | 32 +++++++------------------- code/Surveyor/GenomeAssemblyReader.h | 1 - code/Surveyor/GenomeGraphReader.cpp | 31 ++++++++----------------- 3 files changed, 17 insertions(+), 47 deletions(-) diff --git a/code/Surveyor/GenomeAssemblyReader.cpp b/code/Surveyor/GenomeAssemblyReader.cpp index 4e509fe1..73385494 100644 --- a/code/Surveyor/GenomeAssemblyReader.cpp +++ b/code/Surveyor/GenomeAssemblyReader.cpp @@ -38,7 +38,7 @@ using namespace std; #include -// TODO: need to know the kmer size +// DONE: need to know the kmer size GenomeAssemblyReader::GenomeAssemblyReader() { @@ -52,21 +52,12 @@ void GenomeAssemblyReader::receive(Message & message) { int type = message.getTag(); - /* - printName(); - cout << "received tag " << type << endl; - */ - if(type == START_PARTY) { startParty(message); } else if(type == CoalescenceManager::PAYLOAD_RESPONSE) { - /* - printName(); - cout << " DEBUG readLine because PAYLOAD_RESPONSE" << endl; - */ // read the next line now ! readKmer(); } @@ -82,7 +73,7 @@ void GenomeAssemblyReader::startParty(Message & message) { m_loaded = 0; printName(); - cout << " [AssemblyReader] opens file " << m_fileName << endl; + cout << "[AssemblyReader] opens file " << m_fileName << endl; m_parent = message.getSourceActor(); @@ -105,9 +96,6 @@ void GenomeAssemblyReader::readKmer() { string badParent = ""; string badChild = ""; - // ofstream outFile; - // outFile.open("kmers-created.txt", ios::app); - if(m_kmerReader.hasAnotherKmer()){ m_kmerReader.fetchNextKmer(sequence); @@ -206,14 +194,10 @@ void GenomeAssemblyReader::manageCommunicationForNewKmer(string & sequence, Cove position += sizeof(m_sample); -// maybe: accumulate many objects before flushing it. -// we can go up to MAXIMUM_MESSAGE_SIZE_IN_BYTES bytes. + // maybe: accumulate many objects before flushing it. + // we can go up to MAXIMUM_MESSAGE_SIZE_IN_BYTES bytes. - /* - printName(); - cout << " got data line " << buffer; - cout << " sending PAYLOAD to " << m_aggregator << endl; - */ + // Sending PAYLOAD to the CoalescenceManager Message message; message.setTag(CoalescenceManager::PAYLOAD); message.setBuffer(messageBuffer); @@ -221,7 +205,7 @@ void GenomeAssemblyReader::manageCommunicationForNewKmer(string & sequence, Cove #if 0 printName(); - cout << " DEBUG sending PAYLOAD to " << m_aggregator; + cout << "DEBUG sending PAYLOAD to " << m_aggregator; cout << " with " << position << " bytes "; vertex.print(sequence.length(), false); cout << endl; @@ -230,7 +214,7 @@ void GenomeAssemblyReader::manageCommunicationForNewKmer(string & sequence, Cove int period = 1000000; if(m_loaded % period == 0 && m_loaded > 0) { printName(); - cout << " [AssemblyReader] loaded " << m_loaded << " sequences" << endl; + cout << "[AssemblyReader] loaded " << m_loaded << " sequences" << endl; } m_loaded ++; send(m_aggregator, message); @@ -245,7 +229,7 @@ void GenomeAssemblyReader::setFileName(string & fileName, int sample) { #if 0 printName(); - cout << " DEBUG setFileName " << m_fileName << endl; + cout << "DEBUG setFileName " << m_fileName << endl; #endif } diff --git a/code/Surveyor/GenomeAssemblyReader.h b/code/Surveyor/GenomeAssemblyReader.h index 1ddaa4de..365fb3d2 100644 --- a/code/Surveyor/GenomeAssemblyReader.h +++ b/code/Surveyor/GenomeAssemblyReader.h @@ -22,7 +22,6 @@ #define GenomeAssemblyReaderHeader #include -/* #include */ #include "GenomeAssemblyReader.h" #include "CoalescenceManager.h" diff --git a/code/Surveyor/GenomeGraphReader.cpp b/code/Surveyor/GenomeGraphReader.cpp index 990e3e43..0dadd2cd 100644 --- a/code/Surveyor/GenomeGraphReader.cpp +++ b/code/Surveyor/GenomeGraphReader.cpp @@ -18,7 +18,7 @@ along with Ray Surveyor. If not, see . */ -// TODO: validate that the kmer length is the same for this file +// DONE: validate that the kmer length is the same for this file // and the provided -k argument #include "GenomeGraphReader.h" @@ -47,21 +47,12 @@ void GenomeGraphReader::receive(Message & message) { int type = message.getTag(); - /* - printName(); - cout << "received tag " << type << endl; -*/ - if(type == START_PARTY) { startParty(message); } else if(type == CoalescenceManager::PAYLOAD_RESPONSE) { - /* - printName(); - cout << " DEBUG readLine because PAYLOAD_RESPONSE" << endl; - */ // read the next line now ! readLine(); } @@ -83,7 +74,7 @@ void GenomeGraphReader::startParty(Message & message) { m_loaded = 0; printName(); - cout << " [GraphReader] opens file " << m_fileName << endl; + cout << "[GraphReader] opens file " << m_fileName << endl; m_parent = message.getSourceActor(); @@ -120,11 +111,11 @@ void GenomeGraphReader::readLine() { printName(); if(m_bad) { - cout << " [GraphReader] Error: file " << m_fileName << " does not exist"; + cout << "[GraphReader] Error: file " << m_fileName << " does not exist"; cout << endl; } else { - cout << " [GraphReader] finished reading file " << m_fileName; + cout << "[GraphReader] finished reading file " << m_fileName; cout << " got " << m_loaded << " objects" << endl; } @@ -235,14 +226,10 @@ void GenomeGraphReader::readLine() { position += sizeof(m_sample); -// maybe: accumulate many objects before flushing it. -// we can go up to MAXIMUM_MESSAGE_SIZE_IN_BYTES bytes. + // maybe: accumulate many objects before flushing it. + // we can go up to MAXIMUM_MESSAGE_SIZE_IN_BYTES bytes. - /* - printName(); - cout << " got data line " << buffer; - cout << " sending PAYLOAD to " << m_aggregator << endl; -*/ + // Sending PAYLOAD to the CoalescenceManager Message message; message.setTag(CoalescenceManager::PAYLOAD); message.setBuffer(messageBuffer); @@ -250,7 +237,7 @@ void GenomeGraphReader::readLine() { #if 0 printName(); - cout << " DEBUG sending PAYLOAD to " << m_aggregator; + cout << "DEBUG sending PAYLOAD to " << m_aggregator; cout << " with " << position << " bytes "; vertex.print(sequence.length(), false); cout << endl; @@ -259,7 +246,7 @@ void GenomeGraphReader::readLine() { int period = 1000000; if(m_loaded % period == 0 && m_loaded > 0) { printName(); - cout << " [GraphReader] loaded " << m_loaded << " sequences" << endl; + cout << "[GraphReader] loaded " << m_loaded << " sequences" << endl; } m_loaded ++; send(m_aggregator, message); From 1d8bc84e6a8a7085f038430a72e4eee0844b9b44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Thu, 16 Apr 2015 11:50:30 -0400 Subject: [PATCH 10/23] edit comments in Storekeeper.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/StoreKeeper.cpp | 45 ++--------------------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/code/Surveyor/StoreKeeper.cpp b/code/Surveyor/StoreKeeper.cpp index 9ead8e0d..9004c0af 100644 --- a/code/Surveyor/StoreKeeper.cpp +++ b/code/Surveyor/StoreKeeper.cpp @@ -113,10 +113,6 @@ void StoreKeeper::receive(Message & message) { memcpy(&m_matrixOwner, buffer, sizeof(m_matrixOwner)); -/* - printName(); - cout << "DEBUG m_matrixOwner " << m_matrixOwner << endl; -*/ m_iterator1 = m_localGramMatrix.begin(); if(m_iterator1 != m_localGramMatrix.end()) { @@ -310,7 +306,6 @@ void StoreKeeper::computeLocalGramMatrix() { // since people are going to use this to check // for genome size, don't duplicate counts - // bool reportTwoDNAStrands = false; #if 0 @@ -345,7 +340,7 @@ void StoreKeeper::computeLocalGramMatrix() { if(reportTwoDNAStrands) hits *= 2; - // TODO: samples could be ordered and stored in a vector + // DONE: samples could be ordered and stored in a vector // to stop as soon as j > i // Complexity: quadratic in the number of samples -> physical colors of the virtual color. @@ -376,7 +371,7 @@ void StoreKeeper::computeLocalGramMatrix() { #if 0 sum += hits; #endif - //m_localGramMatrix[sample2Index][sample1Index] += hits; + } } } @@ -439,11 +434,6 @@ void StoreKeeper::pushSampleVertex(Message & message) { bytes -= sizeof(producer); memcpy(&producer, buffer + bytes, sizeof(producer)); - /* - printName(); - cout << "Received payload, last producer was " << producer << endl; - */ - while(position < bytes) { Vertex vertex; @@ -455,17 +445,6 @@ void StoreKeeper::pushSampleVertex(Message & message) { storeData(vertex, sample); - /* - printName(); - cout << " DEBUG received "; - cout << "(from " << message.getSourceActor(); - cout << ") "; - cout << "vertex for sample " << sample; - cout << " with sequence "; - vertex.print(m_kmerLength, m_colorSpaceMode); - cout << endl; - */ - m_receivedObjects ++; if(m_receivedObjects % 1000000 == 0) { @@ -538,7 +517,6 @@ void StoreKeeper::storeData(Vertex & vertex, int & sample) { cout << "DEBUG Growth -> " << before << " -> " << size << endl; #endif - // add the PhysicalKmerColor to the node. PhysicalKmerColor sampleColor = sample; @@ -573,7 +551,6 @@ void StoreKeeper::storeData(Vertex & vertex, int & sample) { if(oldVirtualColor == newVirtualColor) { cout << " new sampleColor " << sampleColor << endl; - //cout << "References " << m_colorSet.getNumberOfReferences(newVirtualColor); cout << endl; cout << " >>> Old samples " << oldSamples.size () << endl; @@ -621,24 +598,6 @@ void StoreKeeper::storeData(Vertex & vertex, int & sample) { m_colorSet.incrementReferences(newVirtualColor); m_colorSet.decrementReferences(oldVirtualColor); - /* - LargeCount referencesForOld = m_colorSet.getNumberOfReferences(oldVirtualColor); - LargeCount referencesForNew = m_colorSet.getNumberOfReferences(newVirtualColor); - -#if 1 - printName(); - cout << "DEBUG Kmer " << kmer.idToWord(m_kmerLength, m_colorSpaceMode); - cout << " Sample " << sample << endl; - cout << " DEBUG Lower " << lowerKey.idToWord(m_kmerLength, m_colorSpaceMode) << endl; -#endif - - - - cout << "DEBUG referencesForOld " << referencesForOld; - cout << " " << " referencesForNew " << referencesForNew; - cout << endl; - - */ } From 74ca0ad5d715f2681fd88f019d6cb9a9bbdfe4bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Thu, 30 Apr 2015 14:31:22 -0400 Subject: [PATCH 11/23] add BUILD to .gitignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 892eba64..c466908d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ Ray test/ PREFIX +BUILD \ No newline at end of file From 9eb236fb1fc3f2d28a9a417a0d27a24629297fd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Mon, 25 May 2015 11:51:00 -0400 Subject: [PATCH 12/23] fix: segfault of message StoreKeeper->KmerMatrixOwner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/KmerMatrixOwner.cpp | 18 ++++++++++++------ code/Surveyor/KmerMatrixOwner.h | 2 +- code/Surveyor/StoreKeeper.cpp | 21 +++++++++++++++------ 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/code/Surveyor/KmerMatrixOwner.cpp b/code/Surveyor/KmerMatrixOwner.cpp index f7d2829c..29c909e8 100644 --- a/code/Surveyor/KmerMatrixOwner.cpp +++ b/code/Surveyor/KmerMatrixOwner.cpp @@ -26,6 +26,7 @@ #include #include #include +#include using namespace std; #include @@ -70,17 +71,18 @@ void KmerMatrixOwner::receive(Message & message) { } else if(tag == PUSH_KMER_SAMPLES) { - vector samplesWithKmer; + vector samplesWithKmer; int offset = 0; Kmer kmer; offset += kmer.load(buffer); int numberOfSamples = m_sampleNames->size(); + char * bufferForSamples = buffer + offset; for(int i=0; i samplesWithKmer; + vector samplesWithKmer; int offset = 0; Kmer kmer; offset += kmer.load(buffer); int numberOfSamples = m_sampleNames->size(); + char * bufferForSamples = buffer + offset; for(int i=0; i & samplesWithKmer, bool force) { +void KmerMatrixOwner::dumpKmerMatrixBuffer(Kmer & kmer, vector & samplesWithKmer, bool force) { + m_kmerMatrix << kmer.idToWord(m_parameters->getWordSize(),0); - for(int i =0; i < (signed) samplesWithKmer.size(); ++i){ + // int checkpoint = 500; + for(int i=0; i < (signed) samplesWithKmer.size(); ++i){ m_kmerMatrix << "\t" << samplesWithKmer[i]; } + m_kmerMatrix << endl; flushFileOperationBuffer(force, &m_kmerMatrix, &m_kmerMatrixFile, CONFIG_FILE_IO_BUFFER_SIZE); diff --git a/code/Surveyor/KmerMatrixOwner.h b/code/Surveyor/KmerMatrixOwner.h index 5d918a3f..0e800e8d 100644 --- a/code/Surveyor/KmerMatrixOwner.h +++ b/code/Surveyor/KmerMatrixOwner.h @@ -43,7 +43,7 @@ class KmerMatrixOwner : public Actor { ostringstream m_kmerMatrix; ofstream m_kmerMatrixFile; - void dumpKmerMatrixBuffer(Kmer & kmer, vector & samplesWithKmer, bool force); + void dumpKmerMatrixBuffer(Kmer & kmer, vector & samplesWithKmer, bool force); void createKmersMatrixOutputFile(); void printMatrixHeader(); diff --git a/code/Surveyor/StoreKeeper.cpp b/code/Surveyor/StoreKeeper.cpp index 9004c0af..37866863 100644 --- a/code/Surveyor/StoreKeeper.cpp +++ b/code/Surveyor/StoreKeeper.cpp @@ -32,6 +32,8 @@ #include #include #include +#include + using namespace std; #include @@ -608,13 +610,19 @@ void StoreKeeper::setSampleSize(int sampleSize) { void StoreKeeper::sendKmersSamples() { - char buffer[MAXIMUM_MESSAGE_SIZE_IN_BYTES]; + char buffer[m_sampleSize+256]; int bytes = 0; ExperimentVertex * currentVertex = NULL; VirtualKmerColorHandle currentVirtualColor = NULL_VIRTUAL_COLOR; - vector samplesVector (m_sampleSize, false); + char samplesArray[m_sampleSize+1]; + samplesArray[m_sampleSize] = '\0'; + + // Set all samples to 0 + for (int i=0; i:: iterator sampleIterator = samples->begin(); sampleIterator != samples->end(); ++sampleIterator) { PhysicalKmerColor value = *sampleIterator; - samplesVector[value] = true; + samplesArray[value] = '1'; } - for (std::vector::iterator it = samplesVector.begin(); - it != samplesVector.end(); ++it) { - buffer[bytes] = *it; + + for (int i=0; i Date: Mon, 22 Jun 2015 10:38:43 -0400 Subject: [PATCH 13/23] add multiple matrices output for filters [DEBUG] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- .gitignore | 3 +- code/Mock/Parameters.cpp | 24 +++++--- code/Surveyor/MatrixOwner.cpp | 9 ++- code/Surveyor/MatrixOwner.h | 6 ++ code/Surveyor/Mother.cpp | 102 ++++++++++++++++++++++++++++------ code/Surveyor/Mother.h | 18 ++++++ code/Surveyor/StoreKeeper.cpp | 4 +- code/Surveyor/StoreKeeper.h | 1 + 8 files changed, 138 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index c466908d..fa3a6ceb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ Ray test/ PREFIX -BUILD \ No newline at end of file +BUILD +REFIX \ No newline at end of file diff --git a/code/Mock/Parameters.cpp b/code/Mock/Parameters.cpp index 0fecbc02..b1bdfbe5 100644 --- a/code/Mock/Parameters.cpp +++ b/code/Mock/Parameters.cpp @@ -1611,22 +1611,30 @@ void Parameters::showUsage(){ showOptionDescription("RayOutput/Surveyor/DistanceMatrix.tsv is a distance matrix (kernel-based)."); cout << endl; - showOption("-read-sample-graph SampleName SampleGraphFile", "Reads a sample graph (generated with -write-kmers)"); + showOption("-read-sample-graph SampleName SampleGraphFile", "Reads a sample graph (generated with -write-kmers from a Ray's assembly)"); cout<" << endl; + cout << endl; + + showOption("-filter-in-[assembly|graph]- SampleName Sample[Assembly|Graph]File", "Incorporate only these dataset kmers when building the similarity and distance matrices"); + showOptionDescription("If there's multiple instances of this option than the kmer will be incorporate if present in one dataset"); + showOptionDescription("No need to read the dataset as a sample before using this option"); + showOptionDescription("X is the number of the filter"); - showOption("-filter-in-[assembly|graph] SampleName Sample[Assembly|Graph]File", "Incorporate only the sample kmers in the similarity and distance matrices"); - showOptionDescription("If there's multiple instances of this option than the kmer will be incorporate if present in one sample"); - showOptionDescription("No need to read the sample before using this option"); cout< SampleName Sample[Assembly|Graph]File", "Filter out the dataset kmers when building the similarity and distance matrices"); + showOptionDescription("No need to read the dataset as a sample before using this option"); + showOptionDescription("X is the number of the filter"); cout< > m_localGramMatrix; map > m_kernelDistanceMatrix; + typedef map > localGramMatrix; + typedef map > localDistMatrix; + + vector gramMatrices; + vector distMatrices; + int m_mother; int m_completedStoreActors; diff --git a/code/Surveyor/Mother.cpp b/code/Surveyor/Mother.cpp index 309f46cd..a17a779e 100644 --- a/code/Surveyor/Mother.cpp +++ b/code/Surveyor/Mother.cpp @@ -347,6 +347,9 @@ void Mother::startSurveyor() { // Set matricesAreReady to true in case user doesn't want // to print out kmers matrix. m_matricesAreReady = true; + int matricesIterator = 0; + int filterTypes [4] = {INPUT_FILTERIN_GRAPH, INPUT_FILTEROUT_GRAPH, INPUT_FILTERIN_ASSEMBLY, INPUT_FILTEROUT_ASSEMBLY}; + int typeIndex = 0; vector * commands = m_parameters->getCommands(); @@ -358,29 +361,96 @@ void Mother::startSurveyor() { // DONE: Check bounds for file names map fastTable; + int filterIndex = 0; - fastTable["-read-sample-graph"] = INPUT_TYPE_GRAPH; - fastTable["-filter-in-graph"] = INPUT_FILTERIN_GRAPH; - fastTable["-filter-out-graph"] = INPUT_FILTEROUT_GRAPH; - fastTable["-read-sample-assembly"] = INPUT_TYPE_ASSEMBLY; - fastTable["-filter-in-assembly"] = INPUT_FILTERIN_ASSEMBLY; - fastTable["-filter-out-assembly"] = INPUT_FILTEROUT_ASSEMBLY; + if (element.find("-filter") != string::npos) { - // Unsupported option - if(fastTable.count(element) == 0 || i+2 > (int) commands->size()) - continue; + int i = 4; + char * filterStr = new char[element.length()+1]; + char * strIt; - string sampleName = commands->at(++i); - string fileName = commands->at(++i); + strcpy(filterStr, element.c_str()); - m_sampleNames.push_back(sampleName); + strIt = strtok(filterStr, "-"); - // DONE implement this m_assemblyFileNames + type - m_inputFileNames.push_back(fileName); + while (strIt != NULL && i > 0) { + cout << "DEBUG: iterator " << i << " filterStr " << strIt << endl; + strIt = strtok (NULL, "-"); - int type = fastTable[element]; + // filter_cmd = tolower(*strIt); - m_sampleInputTypes.push_back(type); + // m_filterMatrices[matrices_iterator] = [strIt][]; + // -filter-in-graph-1 + + switch (i) { + case 4: + filterIndex = atoi(strIt); + break; + case 3 : + if (strcasecmp(strIt,"in") != 0) + typeIndex += 1; + break; + case 2 : + if (strcasecmp(strIt,"graph") != 0) + typeIndex += 2; + break; + case 1 : + // filter word + break; + + } + + i = i - 1; + } + + map>::iterator it = m_filterMatrices.find(filterIndex); + if (it != m_filterMatrices.end()) { + + } else { + vector inputSampleTypes; + inputSampleTypes.push_back(filterTypes[filterIndex]); + m_filterMatrices.insert (std::pair>(filterIndex,inputSampleTypes)); + } + + // m_filterMatrices + if(std::find(m_filterMatrices.begin(), m_filterMatrices.end(), item)!=m_filterMatrices.end()){ + // Find the item + } + + // Filter f; + // f.m_filterNumber = filterIndex; + // // f.m_sampleInputTypes = new vector; + + // m_filterMatrices.push_back(f); + + } else { + + fastTable["-read-sample-graph"] = INPUT_TYPE_GRAPH; + fastTable["-filter-in-graph"] = INPUT_FILTERIN_GRAPH; + fastTable["-filter-out-graph"] = INPUT_FILTEROUT_GRAPH; + + fastTable["-read-sample-assembly"] = INPUT_TYPE_ASSEMBLY; + fastTable["-filter-in-assembly"] = INPUT_FILTERIN_ASSEMBLY; + fastTable["-filter-out-assembly"] = INPUT_FILTEROUT_ASSEMBLY; + + + // Unsupported option + if(fastTable.count(element) == 0 || i+2 > (int) commands->size()) + continue; + + string sampleName = commands->at(++i); + string fileName = commands->at(++i); + + m_sampleNames.push_back(sampleName); + + // DONE implement this m_assemblyFileNames + type + m_inputFileNames.push_back(fileName); + + int type = fastTable[element]; + + m_sampleInputTypes.push_back(type); + + } } else { m_matricesAreReady = false; diff --git a/code/Surveyor/Mother.h b/code/Surveyor/Mother.h index 856adae8..27236f9b 100644 --- a/code/Surveyor/Mother.h +++ b/code/Surveyor/Mother.h @@ -75,8 +75,26 @@ class Mother: public Actor { vector m_sampleNames; vector m_inputFileNames; + vector m_sampleInputTypes; + /* int filterIdentifier; */ + /* int matrixIndex; */ + // map< matrixIndex, map< matrixNb for output + /* vector < map > m_filterMatrices; */ + + map< int, vector > m_filterMatrices; + + /* struct Filter{ */ + /* int m_filterNumber; */ + /* vector m_sampleInputTypes; */ + /* }; */ + + + /* vector m_filterMatrices; */ + + + int m_bigMother; int m_aliveReaders; int m_motherToKill; diff --git a/code/Surveyor/StoreKeeper.cpp b/code/Surveyor/StoreKeeper.cpp index 37866863..82c5a109 100644 --- a/code/Surveyor/StoreKeeper.cpp +++ b/code/Surveyor/StoreKeeper.cpp @@ -110,7 +110,6 @@ void StoreKeeper::receive(Message & message) { #endif computeLocalGramMatrix(); - m_mother = source; memcpy(&m_matrixOwner, buffer, sizeof(m_matrixOwner)); @@ -118,7 +117,6 @@ void StoreKeeper::receive(Message & message) { m_iterator1 = m_localGramMatrix.begin(); if(m_iterator1 != m_localGramMatrix.end()) { - m_iterator2 = m_iterator1->second.begin(); } @@ -136,7 +134,7 @@ void StoreKeeper::receive(Message & message) { sendKmersSamples(); } else if (tag == KmerMatrixOwner::PUSH_KMER_SAMPLES_END) { - + // empty } else if(tag == KmerMatrixOwner::PUSH_KMER_SAMPLES_OK) { sendKmersSamples(); } else if(tag == CoalescenceManager::SET_KMER_INFO) { diff --git a/code/Surveyor/StoreKeeper.h b/code/Surveyor/StoreKeeper.h index 1381704e..9686f913 100644 --- a/code/Surveyor/StoreKeeper.h +++ b/code/Surveyor/StoreKeeper.h @@ -60,6 +60,7 @@ class StoreKeeper: public Actor { int m_mother; int m_matrixOwner; int m_kmerMatrixOwner; + vector * m_sampleInputTypes; bool m_configured; From 059b042be5cd5e2621f6b986ff976fc5a38611c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Mon, 20 Jul 2015 14:03:14 -0400 Subject: [PATCH 14/23] surveyor: many filters in 1 run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/CoalescenceManager.cpp | 4 +- code/Surveyor/MatrixOwner.cpp | 127 ++++++++++++++------ code/Surveyor/MatrixOwner.h | 9 +- code/Surveyor/Mother.cpp | 167 +++++++++++++++------------ code/Surveyor/Mother.h | 15 --- code/Surveyor/StoreKeeper.cpp | 121 +++++++++++-------- code/Surveyor/StoreKeeper.h | 8 +- 7 files changed, 267 insertions(+), 184 deletions(-) diff --git a/code/Surveyor/CoalescenceManager.cpp b/code/Surveyor/CoalescenceManager.cpp index 89852678..817fdbcf 100644 --- a/code/Surveyor/CoalescenceManager.cpp +++ b/code/Surveyor/CoalescenceManager.cpp @@ -91,13 +91,13 @@ void CoalescenceManager::receive(Message & message) { char * buffer = (char*)message.getBufferBytes(); memcpy(&kmerLength, buffer, sizeof(kmerLength)); + if(m_kmerLength == 0) m_kmerLength = kmerLength; if(m_kmerLength != kmerLength) { - printName(); - cout << "[CoalescenceManager] ERROR: the k-mer length is not the same in all input files !"; + cout << "[CoalescenceManager] ERROR: the k-mer length is not the same in all input files : " << m_kmerLength; cout << endl; } diff --git a/code/Surveyor/MatrixOwner.cpp b/code/Surveyor/MatrixOwner.cpp index f7a4c99f..c7582780 100644 --- a/code/Surveyor/MatrixOwner.cpp +++ b/code/Surveyor/MatrixOwner.cpp @@ -31,6 +31,13 @@ using namespace std; #include +#define INPUT_TYPE_GRAPH 0 +#define INPUT_FILTERIN_GRAPH 1 +#define INPUT_FILTEROUT_GRAPH 2 +#define INPUT_TYPE_ASSEMBLY 3 +#define INPUT_FILTERIN_ASSEMBLY 4 +#define INPUT_FILTEROUT_ASSEMBLY 5 + MatrixOwner::MatrixOwner() { m_completedStoreActors = 0; @@ -61,6 +68,43 @@ void MatrixOwner::receive(Message & message) { offset += sizeof(m_parameters); memcpy(&m_sampleNames, buffer + offset, sizeof(m_sampleNames)); offset += sizeof(m_sampleNames); + memcpy(&m_filterMatrices, buffer + offset, sizeof(m_filterMatrices)); + offset += sizeof(m_filterMatrices); + + // Build a m_sampleByFilter to only print samples and filters that belong to a filtered matrix + for (map< int, vector >::iterator it = m_filterMatrices->begin(); it!=m_filterMatrices->end(); ++it) { + // fill sampleByFilter with 1 (sample present in that matrix) + for (int z=0;zsecond.size();z++) { + m_sampleByFilter[it->first].push_back(1); + } + } + + for (map< int, vector >::iterator it = m_filterMatrices->begin(); it!=m_filterMatrices->end(); ++it) { + + // skip -1 complete no filter matrix + if (it->first == -1) + continue; + + // look for filter samples to hide in other matrices + vector samples_to_hide_in_other_matrix; + + for(vector::iterator sampleIt = it->second.begin(); sampleIt != it->second.end(); ++sampleIt) { + if (*sampleIt != INPUT_TYPE_GRAPH && *sampleIt != INPUT_TYPE_ASSEMBLY) { + samples_to_hide_in_other_matrix.push_back(distance(it->second.begin(), sampleIt)); + } + } + + for (map< int, vector >::iterator it2 = m_sampleByFilter.begin(); it2!=m_sampleByFilter.end(); ++it2) { + if(it2->first == -1 || it2->first == it->first) + continue; + + for(vector::iterator sampleIt2 = samples_to_hide_in_other_matrix.begin(); sampleIt2 != samples_to_hide_in_other_matrix.end(); ++sampleIt2) { + it2->second.at(*sampleIt2) = 0; + } + } + + } + #ifdef CONFIG_ASSERT assert(m_parameters != NULL); @@ -73,6 +117,7 @@ void MatrixOwner::receive(Message & message) { SampleIdentifier sample1 = -1; SampleIdentifier sample2 = -1; LargeCount count = 0; + int matrixNb; int offset = 0; @@ -82,6 +127,9 @@ void MatrixOwner::receive(Message & message) { offset += sizeof(sample2); memcpy(&count, buffer + offset, sizeof(count)); offset += sizeof(count); + memcpy(&matrixNb, buffer + offset, sizeof(matrixNb)); + offset += sizeof(matrixNb); + #ifdef CONFIG_ASSERT assert(sample1 >= 0); @@ -91,13 +139,11 @@ void MatrixOwner::receive(Message & message) { m_receivedPayloads ++; - m_localGramMatrix[sample1][sample2] += count; - if (gramMatrices.size() < 1) { - gramMatrices.push_back(m_localGramMatrix); - } else { - gramMatrices[0][sample1][sample2] += count; - } + m_gramMatrices[matrixNb][sample1][sample2] += count; + if (sample1 != sample2) { + m_gramMatrices[matrixNb][sample2][sample1] += count; + } Message response; response.setTag(PUSH_PAYLOAD_OK); @@ -122,19 +168,31 @@ void MatrixOwner::receive(Message & message) { createDirectory(surveyorDirectory.c_str()); } - // create SimilarityMatrix - matrixFile << "SimilarityMatrix.tsv"; - string similarityMatrix = matrixFile.str(); - ofstream similarityFile; - similarityFile.open(similarityMatrix.c_str()); - // printLocalGramMatrix(similarityFile, m_localGramMatrix); - printLocalGramMatrix(similarityFile, gramMatrices[0]); - similarityFile.close(); + // print all SimilarityMatrices + for (map::iterator it=m_gramMatrices.begin(); it!=m_gramMatrices.end(); ++it) { - printName(); - cout << "[MatrixOwner] printed the Similarity Matrix: "; - cout << similarityMatrix << endl; + // matrixFile << "SimilarityMatrix.tsv"; + string matrixNb = static_cast( &(ostringstream() << it->first) )->str(); + + string similarityMatrix = ""; + + if (it->first != -1) { + similarityMatrix = (matrixFile.str() + "SimilarityMatrix.filter-" + matrixNb + ".tsv"); + } else { + similarityMatrix = (matrixFile.str() + "SimilarityMatrix.global.tsv"); + } + + ofstream similarityFile; + + similarityFile.open(similarityMatrix.c_str()); + printLocalGramMatrix(similarityFile, it->second, m_sampleByFilter[it->first]); + similarityFile.close(); + + printName(); + cout << "[MatrixOwner] printed the Similarity Matrix: "; + cout << similarityMatrix << endl; + } // create DistanceMatrix @@ -142,13 +200,13 @@ void MatrixOwner::receive(Message & message) { ostringstream matrixFileForDistances; matrixFileForDistances << m_parameters->getPrefix() << "/Surveyor/"; - matrixFileForDistances << "DistanceMatrix.tsv"; + matrixFileForDistances << "DistanceMatrix.global.tsv"; string distanceMatrix = matrixFileForDistances.str(); ofstream distanceFile; distanceFile.open(distanceMatrix.c_str()); - printLocalGramMatrix(distanceFile, m_kernelDistanceMatrix); + printLocalGramMatrix(distanceFile, m_kernelDistanceMatrix, m_sampleByFilter[-1]); distanceFile.close(); printName(); @@ -163,8 +221,7 @@ void MatrixOwner::receive(Message & message) { // clear matrices - - m_localGramMatrix.clear(); + m_gramMatrices.clear(); m_kernelDistanceMatrix.clear(); } } @@ -174,8 +231,8 @@ void MatrixOwner::receive(Message & message) { // TODO: save time by only computing the lower triangle. void MatrixOwner::computeDistanceMatrix() { - for(map >::iterator row = m_localGramMatrix.begin(); - row != m_localGramMatrix.end(); ++row) { + for(map >::iterator row = m_gramMatrices[-1].begin(); + row != m_gramMatrices[-1].end(); ++row) { SampleIdentifier sample1 = row->first; @@ -186,9 +243,9 @@ void MatrixOwner::computeDistanceMatrix() { // d(x, x') = sqrt( k(x,x) + k(x', x') - 2 k (x, x')) LargeCount distance = 0; - distance += m_localGramMatrix[sample1][sample1]; - distance += m_localGramMatrix[sample2][sample2]; - distance -= 2 * m_localGramMatrix[sample1][sample2]; + distance += m_gramMatrices[-1][sample1][sample1]; + distance += m_gramMatrices[-1][sample2][sample2]; + distance -= 2 * m_gramMatrices[-1][sample1][sample2]; distance = (LargeCount) sqrt((double)distance); @@ -200,12 +257,15 @@ void MatrixOwner::computeDistanceMatrix() { } } -void MatrixOwner::printLocalGramMatrix(ostream & stream, map > & matrix) { +void MatrixOwner::printLocalGramMatrix(ostream & stream, map > & matrix, vector & samplesToInclude) { int numberOfSamples = m_sampleNames->size(); for(int i = 0 ; i < numberOfSamples ; ++i) { + if (samplesToInclude[i] == 0) + continue; + string & sampleName1 = m_sampleNames->at(i); stream << " " << sampleName1; @@ -216,12 +276,18 @@ void MatrixOwner::printLocalGramMatrix(ostream & stream, mapat(i); stream << sampleName1; for(int j = 0 ; j < numberOfSamples ; ++j) { + if (samplesToInclude[j] == 0) + continue; + //string & sampleName2 = m_sampleNames->at(j); LargeCount hits = 0; @@ -244,12 +310,6 @@ void MatrixOwner::printLocalGramMatrix(ostream & stream, map > & matrix) { - /* - printName(); - cout << "Local Gram Matrix: " << endl; - cout << endl; - */ - for(map >::iterator column = matrix.begin(); column != matrix.end(); ++column) { @@ -280,4 +340,3 @@ void MatrixOwner::printLocalGramMatrixWithHash(ostream & stream, map * m_sampleNames; - map > m_localGramMatrix; map > m_kernelDistanceMatrix; typedef map > localGramMatrix; - typedef map > localDistMatrix; + map m_gramMatrices; - vector gramMatrices; - vector distMatrices; + map< int, vector > * m_filterMatrices; + map > m_sampleByFilter; int m_mother; int m_completedStoreActors; - void printLocalGramMatrix(ostream & stream, map > & matrix); + void printLocalGramMatrix(ostream & stream, map > & matrix, vector & samplesToInclude); void printLocalGramMatrixWithHash(ostream & stream, map > & matrix); void computeDistanceMatrix(); diff --git a/code/Surveyor/Mother.cpp b/code/Surveyor/Mother.cpp index a17a779e..a0e89e11 100644 --- a/code/Surveyor/Mother.cpp +++ b/code/Surveyor/Mother.cpp @@ -38,6 +38,7 @@ using namespace std; #define PLAN_RANK_ACTORS_PER_RANK 1 #define PLAN_MOTHER_ACTORS_PER_RANK 1 #define PLAN_GENOME_GRAPH_READER_ACTORS_PER_RANK 999999 + #define INPUT_TYPE_GRAPH 0 #define INPUT_FILTERIN_GRAPH 1 #define INPUT_FILTEROUT_GRAPH 2 @@ -347,118 +348,122 @@ void Mother::startSurveyor() { // Set matricesAreReady to true in case user doesn't want // to print out kmers matrix. m_matricesAreReady = true; - int matricesIterator = 0; int filterTypes [4] = {INPUT_FILTERIN_GRAPH, INPUT_FILTEROUT_GRAPH, INPUT_FILTERIN_ASSEMBLY, INPUT_FILTEROUT_ASSEMBLY}; - int typeIndex = 0; - - vector * commands = m_parameters->getCommands(); - for(int i = 0 ; i < (int) commands->size() ; ++i) { + map fastTable; - string & element = commands->at(i); + fastTable["-read-sample-graph"] = INPUT_TYPE_GRAPH; + fastTable["-read-sample-assembly"] = INPUT_TYPE_ASSEMBLY; - if (element != "-write-kmer-matrix") { - // DONE: Check bounds for file names + vector * commands = m_parameters->getCommands(); - map fastTable; - int filterIndex = 0; + vector sampleTypesTmpBuffer; - if (element.find("-filter") != string::npos) { + vector inputSampleTypes; + m_filterMatrices.insert (std::pair >(-1,inputSampleTypes)); - int i = 4; - char * filterStr = new char[element.length()+1]; - char * strIt; + for(int i = 0 ; i < (int) commands->size() ; ++i) { - strcpy(filterStr, element.c_str()); + string & element = commands->at(i); - strIt = strtok(filterStr, "-"); + if (element == "-write-kmer-matrix") { + m_matricesAreReady = false; + m_printKmerMatrix = true; + continue; + } - while (strIt != NULL && i > 0) { - cout << "DEBUG: iterator " << i << " filterStr " << strIt << endl; - strIt = strtok (NULL, "-"); + int filterIndex = 0; + int typeIndex = 0; - // filter_cmd = tolower(*strIt); + if (element.find("-filter") == 0) { - // m_filterMatrices[matrices_iterator] = [strIt][]; - // -filter-in-graph-1 + int j = 4; + char * filterStr = new char[element.length()+1]; + char * strIt; + string type = "graph"; - switch (i) { - case 4: - filterIndex = atoi(strIt); - break; - case 3 : - if (strcasecmp(strIt,"in") != 0) - typeIndex += 1; - break; - case 2 : - if (strcasecmp(strIt,"graph") != 0) - typeIndex += 2; - break; - case 1 : - // filter word - break; + strcpy(filterStr, element.c_str()); - } + strIt = strtok(filterStr, "-"); - i = i - 1; - } + while (strIt != NULL && j > 0) { - map>::iterator it = m_filterMatrices.find(filterIndex); - if (it != m_filterMatrices.end()) { - - } else { - vector inputSampleTypes; - inputSampleTypes.push_back(filterTypes[filterIndex]); - m_filterMatrices.insert (std::pair>(filterIndex,inputSampleTypes)); - } + strIt = strtok (NULL, "-"); - // m_filterMatrices - if(std::find(m_filterMatrices.begin(), m_filterMatrices.end(), item)!=m_filterMatrices.end()){ - // Find the item + switch (j) { + case 4: + if (strcmp(strIt,"in") != 0) { + typeIndex += 1; + } + break; + case 3 : + if (strcmp(strIt,"graph") != 0) { + typeIndex += 2; + type = "assembly"; + } + break; + case 2 : + filterIndex = atoi(strIt); + break; + case 1 : + // filter word + break; } - // Filter f; - // f.m_filterNumber = filterIndex; - // // f.m_sampleInputTypes = new vector; - - // m_filterMatrices.push_back(f); - - } else { + j = j - 1; + } - fastTable["-read-sample-graph"] = INPUT_TYPE_GRAPH; - fastTable["-filter-in-graph"] = INPUT_FILTERIN_GRAPH; - fastTable["-filter-out-graph"] = INPUT_FILTEROUT_GRAPH; - fastTable["-read-sample-assembly"] = INPUT_TYPE_ASSEMBLY; - fastTable["-filter-in-assembly"] = INPUT_FILTERIN_ASSEMBLY; - fastTable["-filter-out-assembly"] = INPUT_FILTEROUT_ASSEMBLY; + bool new_filter = true; + for (map >::iterator it = m_filterMatrices.begin(); it!= m_filterMatrices.end(); ++it) { + if (it->first == filterIndex) { + it->second.push_back(filterTypes[typeIndex]); + new_filter = false; + } else { + it->second.push_back(fastTable["-read-sample-"+type]); + } + } + if (new_filter == true) { + vector inputSampleTypes; + inputSampleTypes.insert(inputSampleTypes.end(),m_filterMatrices[-1].begin(),--m_filterMatrices[-1].end()); + inputSampleTypes.push_back(filterTypes[typeIndex]); + m_filterMatrices.insert (std::pair >(filterIndex,inputSampleTypes)); + } - // Unsupported option - if(fastTable.count(element) == 0 || i+2 > (int) commands->size()) - continue; + m_sampleInputTypes.push_back(fastTable["-read-sample-"+type]); - string sampleName = commands->at(++i); - string fileName = commands->at(++i); - m_sampleNames.push_back(sampleName); + // add samples + string sampleName = commands->at(++i); + string fileName = commands->at(++i); + m_sampleNames.push_back(sampleName); + m_inputFileNames.push_back(fileName); - // DONE implement this m_assemblyFileNames + type - m_inputFileNames.push_back(fileName); + } else if (element.find("-read-sample") == 0) { - int type = fastTable[element]; + // Unsupported option + if(fastTable.count(element) == 0 || i+2 > (int) commands->size()) + continue; - m_sampleInputTypes.push_back(type); + int type = fastTable[element]; + m_sampleInputTypes.push_back(type); + for (map< int, vector >::iterator it = m_filterMatrices.begin(); it != m_filterMatrices.end(); ++it) { + it->second.push_back(type); } - } else { - m_matricesAreReady = false; - m_printKmerMatrix = true; + // add samples + string sampleName = commands->at(++i); + string fileName = commands->at(++i); + m_sampleNames.push_back(sampleName); + m_inputFileNames.push_back(fileName); + } } + if(isRoot) { printName(); cout << "[BigMother] I am the Survey's Goddess and I am watching you!" << endl; @@ -498,6 +503,7 @@ void Mother::startSurveyor() { // send kmerLength and sampleInputTypes to localStore int kmerLength = m_parameters->getWordSize(); vector * sampleInputTypes = & m_sampleInputTypes; + map< int, vector > * filterMatrices = & m_filterMatrices; char buffer[32]; int offset = 0; @@ -505,6 +511,9 @@ void Mother::startSurveyor() { offset += sizeof(kmerLength); memcpy(buffer + offset, &sampleInputTypes, sizeof(sampleInputTypes)); offset += sizeof(sampleInputTypes); + memcpy(buffer + offset, &filterMatrices, sizeof(filterMatrices)); + offset += sizeof(filterMatrices); + Message kmerMessage; kmerMessage.setBuffer(&kmerLength); @@ -609,6 +618,7 @@ void Mother::spawnMatrixOwner() { Message greetingMessage; vector * names = & m_sampleNames; + map< int, vector > * filterMatrices = & m_filterMatrices; char buffer[32]; int offset = 0; @@ -616,6 +626,9 @@ void Mother::spawnMatrixOwner() { offset += sizeof(m_parameters); memcpy(buffer + offset, &names, sizeof(names)); offset += sizeof(names); + memcpy(buffer + offset, &filterMatrices, sizeof(filterMatrices)); + offset += sizeof(filterMatrices); + greetingMessage.setBuffer(&buffer); greetingMessage.setNumberOfBytes(offset); diff --git a/code/Surveyor/Mother.h b/code/Surveyor/Mother.h index 27236f9b..26c7b0b9 100644 --- a/code/Surveyor/Mother.h +++ b/code/Surveyor/Mother.h @@ -78,23 +78,8 @@ class Mother: public Actor { vector m_sampleInputTypes; - /* int filterIdentifier; */ - /* int matrixIndex; */ - // map< matrixIndex, map< matrixNb for output - /* vector < map > m_filterMatrices; */ - map< int, vector > m_filterMatrices; - /* struct Filter{ */ - /* int m_filterNumber; */ - /* vector m_sampleInputTypes; */ - /* }; */ - - - /* vector m_filterMatrices; */ - - - int m_bigMother; int m_aliveReaders; int m_motherToKill; diff --git a/code/Surveyor/StoreKeeper.cpp b/code/Surveyor/StoreKeeper.cpp index 82c5a109..11e7cec0 100644 --- a/code/Surveyor/StoreKeeper.cpp +++ b/code/Surveyor/StoreKeeper.cpp @@ -114,9 +114,16 @@ void StoreKeeper::receive(Message & message) { memcpy(&m_matrixOwner, buffer, sizeof(m_matrixOwner)); - m_iterator1 = m_localGramMatrix.begin(); + // m_iterator1 = m_localGramMatrix.begin(); - if(m_iterator1 != m_localGramMatrix.end()) { + // if(m_iterator1 != m_localGramMatrix.end()) { + // m_iterator2 = m_iterator1->second.begin(); + // } + + m_iterator0 = m_localGramMatrices.begin(); + m_iterator1 = m_localGramMatrices[m_iterator0->first].begin(); + + if(m_iterator1 != m_localGramMatrices[m_iterator0->first].end()) { m_iterator2 = m_iterator1->second.begin(); } @@ -147,13 +154,13 @@ void StoreKeeper::receive(Message & message) { position += sizeof(kmerLength); memcpy(&m_sampleInputTypes, buffer + position, sizeof(m_sampleInputTypes)); position += sizeof(m_sampleInputTypes); - + memcpy(&m_filterMatrices, buffer + position, sizeof(m_filterMatrices)); + position += sizeof(m_filterMatrices); if(m_kmerLength == 0) m_kmerLength = kmerLength; if(kmerLength != m_kmerLength) { - cout << "[StoreKeeper] ERROR: the k-mer value is different this time !" << endl; } @@ -180,51 +187,63 @@ void StoreKeeper::receive(Message & message) { void StoreKeeper::sendMatrixCell() { - if(m_iterator1 != m_localGramMatrix.end()) { + if (m_iterator0 != m_localGramMatrices.end()) { - if(m_iterator2 != m_iterator1->second.end()) { + if(m_iterator1 != m_localGramMatrices[m_iterator0->first].end()) { - SampleIdentifier sample1 = m_iterator1->first; - SampleIdentifier sample2 = m_iterator2->first; - LargeCount count = m_iterator2->second; + if(m_iterator2 != m_iterator1->second.end()) { - Message message; - char buffer[20]; - int offset = 0; - memcpy(buffer + offset, &sample1, sizeof(sample1)); - offset += sizeof(sample1); - memcpy(buffer + offset, &sample2, sizeof(sample2)); - offset += sizeof(sample2); - memcpy(buffer + offset, &count, sizeof(count)); - offset += sizeof(count); + SampleIdentifier sample1 = m_iterator1->first; + SampleIdentifier sample2 = m_iterator2->first; + LargeCount count = m_iterator2->second; + int matrixNb = m_iterator0->first; - message.setBuffer(buffer); - message.setNumberOfBytes(offset); - message.setTag(MatrixOwner::PUSH_PAYLOAD); + Message message; + char buffer[20]; + int offset = 0; + memcpy(buffer + offset, &sample1, sizeof(sample1)); + offset += sizeof(sample1); + memcpy(buffer + offset, &sample2, sizeof(sample2)); + offset += sizeof(sample2); + memcpy(buffer + offset, &count, sizeof(count)); + offset += sizeof(count); + memcpy(buffer + offset, &matrixNb, sizeof(matrixNb)); + offset += sizeof(matrixNb); - send(m_matrixOwner, message); + message.setBuffer(buffer); + message.setNumberOfBytes(offset); + message.setTag(MatrixOwner::PUSH_PAYLOAD); - m_iterator2++; + send(m_matrixOwner, message); - // end of the line - if(m_iterator2 == m_iterator1->second.end()) { + m_iterator2++; - m_iterator1++; + // end of the line + if(m_iterator2 == m_iterator1->second.end()) { - if(m_iterator1 != m_localGramMatrix.end()) { + m_iterator1++; - m_iterator2 = m_iterator1->second.begin(); + if(m_iterator1 != m_localGramMatrices[m_iterator0->first].end()) { + m_iterator2 = m_iterator1->second.begin(); + } else { + m_iterator0++; + if (m_iterator0 != m_localGramMatrices.end()) { + m_iterator1 = m_localGramMatrices[m_iterator0->first].begin(); + m_iterator2 = m_iterator1->second.begin(); + } + } } - } - return; + return; + } } + } // we processed all the matrix // free memory. - m_localGramMatrix.clear(); + m_localGramMatrices.clear(); printName(); cout << "[StoreKeeper] local Gram Matrix clearance!" << endl; @@ -302,8 +321,6 @@ void StoreKeeper::computeLocalGramMatrix() { // This directed survey only aims at counting colored kmers with colors // other than sample colors - bool filterOutKmer = checkKmerFilter(samples); - // since people are going to use this to check // for genome size, don't duplicate counts bool reportTwoDNAStrands = false; @@ -336,6 +353,11 @@ void StoreKeeper::computeLocalGramMatrix() { #endif + + // bool filterOutKmer = checkKmerFilter(samples); + bool filterOutKmer = false; + + // we have 2 DNA strands !!! if(reportTwoDNAStrands) hits *= 2; @@ -357,16 +379,17 @@ void StoreKeeper::computeLocalGramMatrix() { SampleIdentifier sample2Index = *sample2; - //if(sample2 < sample1) - // this is a diagonal matrix + for(map< int, vector >::iterator filter = m_filterMatrices->begin(); + filter != m_filterMatrices->end(); ++filter) { - if(filterOutKmer) - continue; + filterOutKmer = checkKmerFilter(samples, &filter->second); - m_localGramMatrix[sample1Index][sample2Index] += hits; + if(filterOutKmer) { + continue; + } - if (sample1Index != sample2Index) - m_localGramMatrix[sample2Index][sample1Index] += hits; + m_localGramMatrices[filter->first][sample1Index][sample2Index] += hits; + } #if 0 sum += hits; @@ -393,8 +416,8 @@ void StoreKeeper::printLocalGramMatrix() { cout << "[StoreKeeper] Local Gram Matrix:" << endl; cout << endl; - for(map >::iterator column = m_localGramMatrix.begin(); - column != m_localGramMatrix.end(); ++column) { + for(map >::iterator column = m_localGramMatrices[-1].begin(); + column != m_localGramMatrices[-1].end(); ++column) { SampleIdentifier sample = column->first; @@ -403,8 +426,8 @@ void StoreKeeper::printLocalGramMatrix() { cout << endl; - for(map >::iterator row = m_localGramMatrix.begin(); - row != m_localGramMatrix.end(); ++row) { + for(map >::iterator row = m_localGramMatrices[-1].begin(); + row != m_localGramMatrices[-1].end(); ++row) { SampleIdentifier sample1 = row->first; @@ -663,15 +686,16 @@ void StoreKeeper::sendKmersSamples() { -bool StoreKeeper::checkKmerFilter (set * samples) { - +bool StoreKeeper::checkKmerFilter (set * samples, vector * sampleInputTypes) { bool skipKmer = false; vector samplesFILTERIN; - for(std::vector::iterator it = m_sampleInputTypes->begin(); it != m_sampleInputTypes->end(); ++it) { + for(std::vector::iterator it = sampleInputTypes->begin(); it != sampleInputTypes->end(); ++it) { + + // cout << "DEBUG [storekeeper checkfilter] sampleInputTypes : " << *it << endl; - int sample = distance(m_sampleInputTypes->begin(),it); + int sample = distance(sampleInputTypes->begin(),it); if(*it == INPUT_TYPE_GRAPH || *it == INPUT_TYPE_ASSEMBLY){ continue; @@ -686,7 +710,6 @@ bool StoreKeeper::checkKmerFilter (set * samples) { return true; } } - } // Look if the Kmer is not part of a FILTERIN diff --git a/code/Surveyor/StoreKeeper.h b/code/Surveyor/StoreKeeper.h index 9686f913..6c2f147a 100644 --- a/code/Surveyor/StoreKeeper.h +++ b/code/Surveyor/StoreKeeper.h @@ -50,7 +50,10 @@ class StoreKeeper: public Actor { int m_storeDataCalls; int m_receivedPushes; - map > m_localGramMatrix; + + typedef map > localGramMatrix; + map m_localGramMatrices; + map ::iterator m_iterator0; map >::iterator m_iterator1; map::iterator m_iterator2; @@ -62,6 +65,7 @@ class StoreKeeper: public Actor { int m_kmerMatrixOwner; vector * m_sampleInputTypes; + map< int, vector > * m_filterMatrices; bool m_configured; @@ -94,7 +98,7 @@ class StoreKeeper: public Actor { void sendMatrixCell(); - bool checkKmerFilter(set * samples); + bool checkKmerFilter(set * samples, vector * sampleInputTypes); public: StoreKeeper(); From 7dd2cbc593ae9c27eebdc606bf7bd93cf47187ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Mon, 5 Oct 2015 10:33:45 -0400 Subject: [PATCH 15/23] fix undesirable and random print of kmerMatrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/Mother.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/code/Surveyor/Mother.cpp b/code/Surveyor/Mother.cpp index a0e89e11..1fa06a25 100644 --- a/code/Surveyor/Mother.cpp +++ b/code/Surveyor/Mother.cpp @@ -348,6 +348,8 @@ void Mother::startSurveyor() { // Set matricesAreReady to true in case user doesn't want // to print out kmers matrix. m_matricesAreReady = true; + m_printKmerMatrix = false; + int filterTypes [4] = {INPUT_FILTERIN_GRAPH, INPUT_FILTEROUT_GRAPH, INPUT_FILTERIN_ASSEMBLY, INPUT_FILTEROUT_ASSEMBLY}; map fastTable; From 1ed9d2da1c4be858e5e387038ba61e67c42362c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Mon, 14 Dec 2015 16:22:26 -0500 Subject: [PATCH 16/23] add normalized similarity and distance matrices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/MatrixOwner.cpp | 184 +++++++++++++++++++++++++++++++++- code/Surveyor/MatrixOwner.h | 8 ++ 2 files changed, 189 insertions(+), 3 deletions(-) diff --git a/code/Surveyor/MatrixOwner.cpp b/code/Surveyor/MatrixOwner.cpp index c7582780..83829879 100644 --- a/code/Surveyor/MatrixOwner.cpp +++ b/code/Surveyor/MatrixOwner.cpp @@ -30,6 +30,7 @@ using namespace std; #include +#include #define INPUT_TYPE_GRAPH 0 #define INPUT_FILTERIN_GRAPH 1 @@ -194,14 +195,33 @@ void MatrixOwner::receive(Message & message) { cout << similarityMatrix << endl; } + + // normalize the similarity matrix + + normalizeMatrix(); + + ostringstream matrixFileForNormalized; + matrixFileForNormalized << m_parameters->getPrefix() << "/Surveyor/"; + matrixFileForNormalized << "SimilarityMatrix.global.normalized.tsv"; + + string normalizedMatrix = matrixFileForNormalized.str(); + ofstream normalizedFile; + normalizedFile.open(normalizedMatrix.c_str()); + printLocalGramMatrix(normalizedFile, m_normalizedSimilarityMatrix, m_sampleByFilter[-1]); + normalizedFile.close(); + + printName(); + cout << "[MatrixOwner] printed the normalized Similarity Matrix: "; + cout << normalizedMatrix << endl; + + // create DistanceMatrix computeDistanceMatrix(); ostringstream matrixFileForDistances; matrixFileForDistances << m_parameters->getPrefix() << "/Surveyor/"; - matrixFileForDistances << "DistanceMatrix.global.tsv"; - + matrixFileForDistances << "DistanceMatrix.global.euclidean_raw.tsv"; string distanceMatrix = matrixFileForDistances.str(); ofstream distanceFile; @@ -213,6 +233,20 @@ void MatrixOwner::receive(Message & message) { cout << "[MatrixOwner] printed the Distance Matrix: "; cout << distanceMatrix << endl; + ostringstream matrixFileForNormDistances; + matrixFileForNormDistances << m_parameters->getPrefix() << "/Surveyor/"; + matrixFileForNormDistances << "DistanceMatrix.global.euclidean_normalized.tsv"; + + string normDistanceMatrix = matrixFileForNormDistances.str(); + ofstream normDistanceFile; + normDistanceFile.open(normDistanceMatrix.c_str()); + printLocalGramMatrix(normDistanceFile, m_normalizedDistanceMatrix, m_sampleByFilter[-1]); + normDistanceFile.close(); + + printName(); + cout << "[MatrixOwner] printed the normalized Distance Matrix: "; + cout << normDistanceMatrix << endl; + // tell Mother that the matrix is ready now. Message coolMessage; @@ -231,6 +265,7 @@ void MatrixOwner::receive(Message & message) { // TODO: save time by only computing the lower triangle. void MatrixOwner::computeDistanceMatrix() { + // raw matrix for(map >::iterator row = m_gramMatrices[-1].begin(); row != m_gramMatrices[-1].end(); ++row) { @@ -241,7 +276,7 @@ void MatrixOwner::computeDistanceMatrix() { SampleIdentifier sample2 = cell->first; - // d(x, x') = sqrt( k(x,x) + k(x', x') - 2 k (x, x')) + // This is not Euclidean distance .. d(x, x') = sqrt( k(x,x) + k(x', x') - 2 k (x, x')) LargeCount distance = 0; distance += m_gramMatrices[-1][sample1][sample1]; distance += m_gramMatrices[-1][sample2][sample2]; @@ -255,8 +290,99 @@ void MatrixOwner::computeDistanceMatrix() { } } + + + // normalized matrix + // for(map >::iterator row = m_normalizedSimilarityMatrix.begin(); + // row != m_normalizedSimilarityMatrix.end(); ++row) { + + // SampleIdentifier sample1 = row->first; + + // for(map::iterator cell = row->second.begin(); + // cell != row->second.end(); ++cell) { + + // SampleIdentifier sample2 = cell->first; + + + // // This is not Euclidean distance .. d(x, x') = sqrt( k(x,x) + k(x', x') - 2 k (x, x')) + // double distance = 0; + // distance += m_normalizedSimilarityMatrix[sample1][sample1]; + // distance += m_normalizedSimilarityMatrix[sample2][sample2]; + // distance -= 2 * m_normalizedSimilarityMatrix[sample1][sample2]; + + // distance = (double) sqrt((double)distance); + + // m_normalizedDistanceMatrix[sample1][sample2] = distance; + // } + + // } + + + for(map >::iterator row = m_normalizedSimilarityMatrix.begin(); + row != m_normalizedSimilarityMatrix.end(); ++row) { + + SampleIdentifier sample1 = row->first; + double v1[row->second.size()]; + + int x = 0; + for(map::iterator cell = row->second.begin(); cell != row->second.end(); ++cell) { + v1[x] = double(cell->second); + x += 1; + } + + // SampleIdentifier sample2 = cell->first; + + for(map >::iterator row2 = row; + row2 != m_normalizedSimilarityMatrix.end(); ++row2) { + + SampleIdentifier sample2 = row2->first; + double v2[row->second.size()]; + + int y = 0; + for(map::iterator cell2 = row2->second.begin(); cell2 != row2->second.end(); ++cell2) { + v2[y] = double(cell2->second); + y += 1; + } + + double inner_product = 0; + for(unsigned int i = 0; i != row->second.size(); i++) { + double diff = (v1[i]-v2[i]); + inner_product += (diff*diff); + } + + m_normalizedDistanceMatrix[sample1][sample2] = sqrt(inner_product); + m_normalizedDistanceMatrix[sample2][sample1] = sqrt(inner_product); + } + + } + } + +void MatrixOwner::normalizeMatrix() { + + + for(map >::iterator row = m_gramMatrices[-1].begin(); + row != m_gramMatrices[-1].end(); ++row) { + + SampleIdentifier sample1 = row->first; + + for(map::iterator cell = row->second.begin(); + cell != row->second.end(); ++cell) { + + SampleIdentifier sample2 = cell->first; + + double count = (double)(double(m_gramMatrices[-1][sample1][sample2]+1)/sqrt(double(m_gramMatrices[-1][sample1][sample1]+1)*double(m_gramMatrices[-1][sample2][sample2]+1))); + + m_normalizedSimilarityMatrix[sample1][sample2] = count; + } + + } + +} + + + void MatrixOwner::printLocalGramMatrix(ostream & stream, map > & matrix, vector & samplesToInclude) { int numberOfSamples = m_sampleNames->size(); @@ -304,6 +430,58 @@ void MatrixOwner::printLocalGramMatrix(ostream & stream, map > & matrix, vector & samplesToInclude) { + + int numberOfSamples = m_sampleNames->size(); + + for(int i = 0 ; i < numberOfSamples ; ++i) { + + if (samplesToInclude[i] == 0) + continue; + + string & sampleName1 = m_sampleNames->at(i); + + stream << " " << sampleName1; + } + + stream << endl; + + + for(int i = 0 ; i < numberOfSamples ; ++i) { + + if (samplesToInclude[i] == 0) + continue; + + string & sampleName1 = m_sampleNames->at(i); + + stream << sampleName1; + + for(int j = 0 ; j < numberOfSamples ; ++j) { + + if (samplesToInclude[j] == 0) + continue; + + //string & sampleName2 = m_sampleNames->at(j); + + double hits = 0; + + if(matrix.count(i) > 0 && matrix[i].count(j) > 0) { + + hits = matrix[i][j]; + } + + stream << " " << hits; + } + + stream << endl; + } +} + + + + /** * Write it in RaySurveyorResults/SurveyorMatrix.tsv * Also write a distance matrix too ! diff --git a/code/Surveyor/MatrixOwner.h b/code/Surveyor/MatrixOwner.h index 82a4e9da..5ec6255e 100644 --- a/code/Surveyor/MatrixOwner.h +++ b/code/Surveyor/MatrixOwner.h @@ -39,8 +39,13 @@ class MatrixOwner : public Actor { Parameters * m_parameters; vector * m_sampleNames; + // Distance matrix map > m_kernelDistanceMatrix; + // Normalized matrices + map > m_normalizedSimilarityMatrix; + map > m_normalizedDistanceMatrix; + typedef map > localGramMatrix; map m_gramMatrices; @@ -51,9 +56,12 @@ class MatrixOwner : public Actor { int m_completedStoreActors; void printLocalGramMatrix(ostream & stream, map > & matrix, vector & samplesToInclude); + void printLocalGramMatrix(ostream & stream, map > & matrix, vector & samplesToInclude); + void printLocalGramMatrixWithHash(ostream & stream, map > & matrix); void computeDistanceMatrix(); + void normalizeMatrix(); public: From a502b0531593a1c96808648089b7dc57113d9474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20D=C3=A9raspe?= Date: Mon, 14 Dec 2015 16:57:37 -0500 Subject: [PATCH 17/23] edit rm old comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maxime Déraspe --- code/Surveyor/MatrixOwner.cpp | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/code/Surveyor/MatrixOwner.cpp b/code/Surveyor/MatrixOwner.cpp index 83829879..365816a8 100644 --- a/code/Surveyor/MatrixOwner.cpp +++ b/code/Surveyor/MatrixOwner.cpp @@ -293,31 +293,6 @@ void MatrixOwner::computeDistanceMatrix() { // normalized matrix - // for(map >::iterator row = m_normalizedSimilarityMatrix.begin(); - // row != m_normalizedSimilarityMatrix.end(); ++row) { - - // SampleIdentifier sample1 = row->first; - - // for(map::iterator cell = row->second.begin(); - // cell != row->second.end(); ++cell) { - - // SampleIdentifier sample2 = cell->first; - - - // // This is not Euclidean distance .. d(x, x') = sqrt( k(x,x) + k(x', x') - 2 k (x, x')) - // double distance = 0; - // distance += m_normalizedSimilarityMatrix[sample1][sample1]; - // distance += m_normalizedSimilarityMatrix[sample2][sample2]; - // distance -= 2 * m_normalizedSimilarityMatrix[sample1][sample2]; - - // distance = (double) sqrt((double)distance); - - // m_normalizedDistanceMatrix[sample1][sample2] = distance; - // } - - // } - - for(map >::iterator row = m_normalizedSimilarityMatrix.begin(); row != m_normalizedSimilarityMatrix.end(); ++row) { @@ -330,7 +305,6 @@ void MatrixOwner::computeDistanceMatrix() { x += 1; } - // SampleIdentifier sample2 = cell->first; for(map >::iterator row2 = row; row2 != m_normalizedSimilarityMatrix.end(); ++row2) { From 5099a8e3f9bfd6e1a6053539b7d324c72ed2c512 Mon Sep 17 00:00:00 2001 From: zorino Date: Tue, 29 Nov 2016 14:37:26 -0500 Subject: [PATCH 18/23] add Surveyor scripts Signed-off-by: zorino --- scripts/Surveyor/pyvolve-gen-seq.py | 120 ++++++++++++++++ scripts/Surveyor/raysurveyor-conf.py | 39 ++++++ scripts/Surveyor/raysurveyor-gentree.py | 176 ++++++++++++++++++++++++ scripts/Surveyor/requirements.txt | 10 ++ scripts/Surveyor/treeclust-compare.py | 143 +++++++++++++++++++ 5 files changed, 488 insertions(+) create mode 100755 scripts/Surveyor/pyvolve-gen-seq.py create mode 100755 scripts/Surveyor/raysurveyor-conf.py create mode 100755 scripts/Surveyor/raysurveyor-gentree.py create mode 100644 scripts/Surveyor/requirements.txt create mode 100755 scripts/Surveyor/treeclust-compare.py diff --git a/scripts/Surveyor/pyvolve-gen-seq.py b/scripts/Surveyor/pyvolve-gen-seq.py new file mode 100755 index 00000000..0d0b0b69 --- /dev/null +++ b/scripts/Surveyor/pyvolve-gen-seq.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# author: maxime déraspe +# email: maxime@deraspe.net +# date: 2016-10-05 +# version: 0.01 + + +import sys +import os +from subprocess import call +import pyvolve +import pyani +from ete3 import Tree +from Bio import SeqIO + + +def split_seq(seq_f,outdir): + os.mkdir(outdir) if not os.path.exists(outdir) else None + handle = open(seq_f, "r") + for record in SeqIO.parse(handle, "fasta"): + output_handle = open(outdir+"/"+record.id.replace("/","_")+".fasta", "w") + SeqIO.write(record,output_handle,"fasta") + output_handle.close() + handle.close() + +def pyani_seq(seq_f): + outdir = seq_f.replace(".fasta","") + print(outdir) + split_seq(seq_f,outdir) + call("average_nucleotide_identity.py --workers 4 -i %s -o %s -m ANIb" % (outdir, outdir+".ANI"), shell=True) + +def tree_distances_info(file,scale,seq_len): + + t = Tree(file) + # branch_len_matrix_f = file + ".branches-len.tsv" + branch_len_out = open(file + ".%d.patristic-dist.tsv" % seq_len, "w") + tree_info = open(file + ".%d.tree-info.txt" % seq_len, "w") + avg_distance_leaves = 0 + + # Computing patristic distance matrix + header = "" + all_leaves = t.get_leaves() + for i in all_leaves: + header = header + "\t" + i.name + + nb_of_distances = 0 + max_len = 0 + min_len = 99999999999999 + branch_len_out.write(header+"\n") + for leaf1 in all_leaves: + row = "" + row += str(leaf1.name) + for leaf2 in all_leaves: + avg_distance_leaves += leaf1.get_distance(leaf2) + distance = leaf1.get_distance(leaf2) + row += "\t%f" % distance + nb_of_distances += 1 + if distance > max_len: + max_len = distance + if distance < min_len and distance > 0: + min_len = distance + + branch_len_out.write(row+"\n") + + tree_info.write("Scale_factor(1=original-tree)\t%f\n" % scale) + tree_info.write("Seq_Length\t%d\n" % seq_len) + tree_info.write("Number_of_leaves_(taxa)\t%d\n" % len(all_leaves)) + tree_info.write("Minimal_patristic_distance\t%f\n" % min_len) + tree_info.write("Maximal_patristic_distance\t%f\n" % max_len) + tree_info.write("Average_patristic_distance\t%f\n" % (avg_distance_leaves/(nb_of_distances*scale))) + + print("Scale_factor(1=original-tree)\t%f" % scale) + print("Seq_Length\t%d" % seq_len) + print("Number_of_leaves_(taxa)\t%d" % len(all_leaves)) + print("Minimal_patristic_distance\t%f" % min_len) + print("Maximal_patristic_distance\t%f" % max_len) + print("Average_patristic_distance\t%f" % (avg_distance_leaves/(nb_of_distances*scale))) + + branch_len_out.close() + tree_info.close() + +def ray_surveyor_config(): + + print("TODO") + + +# Main # +if __name__ == "__main__": + + usage =''' + python pyvolve-gen-seq.py [ default=1 (no scale)] + ''' + if len(sys.argv) < 3: + sys.exit(usage) + + tree_f = sys.argv[1] + outfiles = tree_f + size = sys.argv[2] + scale = 1 + scale = float(sys.argv[3]) if len(sys.argv) > 3 else None + + print("Reading tree..") + my_tree = pyvolve.read_tree(file = tree_f, scale_tree=scale) + my_model = pyvolve.Model("nucleotide") + my_partition = pyvolve.Partition(models = my_model, size = int(size)) + + print("Simulating sequences..") + my_evolver = pyvolve.Evolver(tree = my_tree, partitions = my_partition) + my_evolver(ratefile = "%s.%s.ratefile.txt" % (outfiles, size), + infofile = "%s.%s.infofile.txt" % (outfiles, size), + seqfile = "%s.%s.seqfile.fasta" % (outfiles, size) ) + + print("Tree info..") + tree_distances_info(tree_f, scale, int(size)) + + print("Running ANI on sequences..") + pyani_seq("%s.%s.seqfile.fasta" % (outfiles, size)) + + diff --git a/scripts/Surveyor/raysurveyor-conf.py b/scripts/Surveyor/raysurveyor-conf.py new file mode 100755 index 00000000..3244119b --- /dev/null +++ b/scripts/Surveyor/raysurveyor-conf.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# author: maxime déraspe +# email: maxime@deraspe.net +# date: 2016-10-16 +# version: 0.01 + + +import sys +import os + +# Main # +if __name__ == "__main__": + + usage = ''' + usage : raysurveyor-conf.py + ''' + + if len(sys.argv) < 3: + print(usage) + sys.exit() + + dir = sys.argv[1] + if dir[-1] == "/": + # print("replacing trailing") + dir = dir[0:-2] + + klen = sys.argv[2] + outdir = dir+".survey.k"+klen + + with open(outdir+".conf","w") as fout: + fout.write("-k "+klen+"\n") + fout.write("-o "+outdir+"\n") + fout.write("-run-surveyor"+"\n") + for f in os.listdir(dir): + fout.write("-read-sample-assembly %s %s" % + (f.replace(".fasta",""), + dir+"/"+f+"\n")) + diff --git a/scripts/Surveyor/raysurveyor-gentree.py b/scripts/Surveyor/raysurveyor-gentree.py new file mode 100755 index 00000000..226d4f4c --- /dev/null +++ b/scripts/Surveyor/raysurveyor-gentree.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# author: maxime déraspe +# email: maxime@deraspe.net +# date: 2016-10-08 +# version: 0.01 + +import sys +import argparse +import pandas as pd +import numpy as np + +from io import StringIO +from Bio import Phylo +from Bio.Phylo import PhyloXML +from Bio.Phylo.TreeConstruction import _DistanceMatrix, _Matrix, DistanceTreeConstructor +from ete3 import Tree, Phyloxml, phyloxml + +from scipy.spatial.distance import pdist, squareform + +from skbio import DistanceMatrix +from skbio.tree import nj + + +def read_args(): + + parser = argparse.ArgumentParser(description='Create Phylogenetic tree from similarity (Gramian) matrix', add_help=False) + + mandatory_group = parser.add_argument_group(title='Options') + mandatory_group.add_argument('-i', '--input', metavar='matrix.tsv', + help='Similarity matrix file (Gramian matrix) - Ray-Suveyor output') + mandatory_group.add_argument('-o', '--output', metavar='tree.nwk', default='tree.nwk', help='Output tree name') + mandatory_group.add_argument('--dist', default='euclidean', + help='Distance metric [euclidean, cosine, minkowski, .. see https://goo.gl/zg2584') + mandatory_group.add_argument('--tree', default='nj', + help='Clustering algorithm to build the tree: nj (neighbor joining) or upgma (rooted - not recommanded) ') + mandatory_group.add_argument('--norm', help='Normalized the similarity matrix first', action='store_true') + mandatory_group.add_argument('--format', default='newick', help='Output Tree Format [newick or phyloxml]') + mandatory_group.add_argument('-h', '--help', help='help message', action='store_true') + + args = vars(parser.parse_args()) + + if (args['input'] == None) or (args['help']): + parser.print_help() + sys.exit() + + return args + + +def read_matrix(file): + pd_frame = pd.read_table(file,sep='\t', skipinitialspace=True, index_col=0) + return pd_frame + + +def normalize_gram_matrix(K): + normX = np.sqrt(K.diagonal()) + return ((K/normX).T/normX).T + +def condense_matrix(df_dist): + print("Creating lower triangle of distance matrix..") + dist_matrix = [] + i = 0 + for index, row in df_dist.iterrows(): + vec = [] + i = i+1 + for j in range(0,i): + vec.append(float(row[j])) + dist_matrix.append(vec) + + +def build_tree(dist_matrix, names_list, clust): + + tree = None + if clust == 'nj': + # print(dist_matrix) + dm = DistanceMatrix(dist_matrix, names_list) + tree_scikit = nj(dm,result_constructor=str) + tree = Tree(tree_scikit) + elif clust == 'upgma': + dm = _DistanceMatrix(names=names_list, matrix=dist_matrix) + constructor = DistanceTreeConstructor() + tree_biopython = constructor.upgma(condense_matrix(dm)) + # # remove InnerNode names + # for i in tree_biopython.get_nonterminals(): + # i.name = None + tree = Tree(tree_biopython) + else: + print("Unknown tree clustering method ! Aborting") + sys.exit() + + return tree + +def tree_distances(file): + + t = Tree(file) + branch_len_out = open(file + ".patristic-dist.tsv", "w") + avg_distance_leaves = 0 + + # Computing patristic distance matrix + header = "" + all_leaves = t.get_leaves() + for i in all_leaves: + header = header + "\t" + i.name + + nb_of_distances = 0 + max_len = 0 + min_len = 9999999999999999 + branch_len_out.write(header+"\n") + for leaf1 in all_leaves: + row = "" + row += str(leaf1.name) + for leaf2 in all_leaves: + distance = np.clip(leaf1.get_distance(leaf2), 0.0, 99999999999999999999999999) + avg_distance_leaves += distance + row += "\t%f" % distance + nb_of_distances += 1 + if distance > max_len: + max_len = distance + if distance < min_len and distance > 0: + min_len = distance + + branch_len_out.write(row+"\n") + + branch_len_out.close() + + +# Main # +if __name__ == "__main__": + + args = read_args() + outfile = "" + outfile = args['output'] + if outfile == "tree.nwk" and args['format'] == 'phyloxml': + outfile = outfile.replace(".nwk",".xml") + + # read from similarity matrix file + print("Reading matrix file..") + df = read_matrix(args['input']) + + print("Computing the distance matrix from gram matrix with [%s] distance metric and Normalized=[%s].." + % (args['dist'], args['norm'])) + mat_dist = None + del_minus_fn = lambda *arr: np.clip(*arr,0,99999999999999999999999999) + + if args['norm']: + norm_dist = normalize_gram_matrix(df.as_matrix()) + mat_dist = squareform(pdist(norm_dist, args['dist'])) + # mat_dist = del_minus_fn(squareform(pdist(norm_dist, args['dist']))) + else: + mat_dist = squareform(pdist(df, args['dist'])) + # mat_dist = del_minus_fn(squareform(pdist(df, args['dist']))) + + df_dist = pd.DataFrame(mat_dist, index=df.index, columns=df.columns) + outfile = args['output'] + df_dist.to_csv(path_or_buf=outfile+".dist.tsv", sep="\t") + + print("Building distance matrix for tree construction..") + names_list = [str(i) for i in df.index.values.tolist()] + + print("Building final [%s] tree.." % args['tree']) + # tree = build_tree(dist_matrix, names_list, args['tree']) + tree = build_tree(df_dist, names_list, args['tree']) + + print("Printing tree..") + if args['format'] == 'phyloxml': + treedata = tree.write() + handle = StringIO(treedata) + tree_out = Phylo.read(handle, "newick") + phyloxml_out = tree_out.as_phyloxml() + Phylo.write(phyloxml_out, args['output'], 'phyloxml') + else: + tree.write(format=0, outfile=args['output']) + # Phylo.write(tree, args['output'], 'newick') + + tree_distances(args['output']) + print("Finish output files : \n %s\n %s\n %s" % (outfile+".dist.tsv", outfile+".patristic-dist.tsv", outfile)) diff --git a/scripts/Surveyor/requirements.txt b/scripts/Surveyor/requirements.txt new file mode 100644 index 00000000..cff80ee0 --- /dev/null +++ b/scripts/Surveyor/requirements.txt @@ -0,0 +1,10 @@ +biopython==1.68 +pyparsing==2.1.10 +pandas==0.19.0 +ete3==3.0.0b35 +numpy==1.11.2 +scipy==0.18.1 +pyani==0.2.1 +Pyvolve==0.8.2 +scikit-bio==0.5.0 +matplotlib==1.5.3 diff --git a/scripts/Surveyor/treeclust-compare.py b/scripts/Surveyor/treeclust-compare.py new file mode 100755 index 00000000..2f5dba51 --- /dev/null +++ b/scripts/Surveyor/treeclust-compare.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# author: maxime déraspe +# email: maxime@deraspe.net +# date: 2016-10-18 +# version: 0.01 + +# import to compare trees +import sys +import argparse +from ete3 import Tree + +# import to compare clusters +from matplotlib import pyplot as plt +from scipy.cluster.hierarchy import linkage, cophenet +from scipy.stats import pearsonr +from scipy.spatial.distance import pdist, squareform +import numpy as np +import pandas as pd + +from skbio import DistanceMatrix +from skbio.stats.distance import mantel + + +def read_args(): + + parser = argparse.ArgumentParser(description='Compare phylogenetic trees and clusters drawn from distance matrices', + add_help=False) + + mandatory_group = parser.add_argument_group(title='Options') + mandatory_group.add_argument('-t', '--type', default='trees', + help='trees or clusters (default=trees)') + mandatory_group.add_argument('-o', '--output', metavar='tree-compare.tsv', + default='tree-compare.tsv', help='Output comparison file') + mandatory_group.add_argument('-r', '--ref', metavar='ref tree/matrix', + help='Reference tree or dist matrix') + mandatory_group.add_argument('-a', '--all', metavar='trees/matrices...', action='append', + help='All other trees or dist matrices to compare with') + mandatory_group.add_argument('-h', '--help', help='help message', action='store_true') + + args = vars(parser.parse_args()) + + if (args['type'] == None or + args['ref'] == None or + args['all'] == None or + args['help']): + parser.print_help() + sys.exit() + + return args + + +def compare_trees(args): + + tree_ref = Tree(args['ref']) + trees = [] + for i in args['all']: + tree = {} + tree['name'] = i + tree['comparison'] = tree_ref.compare(Tree(i),unrooted=True) + trees.append(tree) + + with open(args['output'],"w") as f_out: + f_out.write("Tree\tEffective_Tree_Size\tRF-normalized\tRF\tRF-max\t% edges in Src\t% edges in Ref\n") + for tree in trees: + f_out.write(tree['name']+"\t") + f_out.write(str(tree['comparison']['effective_tree_size'])+"\t") + f_out.write(str(tree['comparison']['norm_rf'])+"\t") + f_out.write(str(tree['comparison']['rf'])+"\t") + f_out.write(str(tree['comparison']['max_rf'])+"\t") + f_out.write(str(tree['comparison']['ref_edges_in_source'])+"\t") + f_out.write(str(tree['comparison']['source_edges_in_ref'])+"\t") + f_out.write("\n") + + + # print(trees[0]['comparison']) + + +def check_symmetry(m): + # will also correct different number + for i in range(0,len(m)-1): + for j in range(0,len(m)-1): + try: + val = float(m[i][j]) + val2 = float(m[j][i]) + except ValueError: + print("Not an number!") + if(m[i][j] != m[j][i]): + m[j][i] = m[i][j] + + +def compare_clusters(args): + + ref_df = pd.read_table(args['ref'], sep='\t', skipinitialspace=True, index_col=0).as_matrix() + check_symmetry(ref_df) + linkage_ref = linkage(ref_df, 'ward') + c_ref, coph_dists_ref = cophenet(linkage_ref, pdist(ref_df)) + + outfile = open(args['output'],"w") + outfile.write("Tree_cluster\tMantel_Correlation_Coefficient\tManter_P-value\tCophenetic_Pearson\tCophenetic_P-value\n") + + for i in args['all']: + fst_df = pd.read_table(i, sep='\t', skipinitialspace=True, index_col=0).as_matrix() + check_symmetry(fst_df) + mantel_coeff = 0.0 + p_value_mantel = 0.0 + cophenetic_pearson = 0.0 + p_value_cophenetic = 0.0 + n = 0 + try: + mantel_coeff, p_value_mantel, n = mantel(ref_df, fst_df) + linkage_fst = linkage(fst_df, 'ward') + c_fst, coph_dists_fst = cophenet(linkage_fst, pdist(fst_df)) + cophenetic_pearson, p_value_cophenetic = pearsonr(coph_dists_ref, coph_dists_fst) + except Exception as e: + print("Error : %s" % str(e)) + mantel_coeff = "Failed" + p_value_manel = "Failed" + cophenetic_pearson = "Failed" + p_value_cophenetic = "Failed" + + outfile.write(i+"\t"+str(mantel_coeff)+"\t"+str(p_value_mantel)+"\t"+str(cophenetic_pearson)+"\t"+str(p_value_cophenetic)+"\n") + + outfile.close() + + +# Main # +if __name__ == "__main__": + + args = read_args() + + if args['type'] == "trees": + compare_trees(args) + elif args['type'] == "clusters": + compare_clusters(args) + else: + print("Bad Type ! .. trees or clusters") + sys.exit() + + # print("Compare tree") + # print(tree_ref.compare(trees[0],unrooted=True)) + # # print(trees[0].name) + From 3bdd5f7d350bb859577ad1304dc595f8ef07c548 Mon Sep 17 00:00:00 2001 From: zorino Date: Mon, 5 Dec 2016 13:58:36 -0500 Subject: [PATCH 19/23] fix bug in upgma gentree Signed-off-by: zorino --- scripts/Surveyor/raysurveyor-gentree.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/Surveyor/raysurveyor-gentree.py b/scripts/Surveyor/raysurveyor-gentree.py index 226d4f4c..35b119e2 100755 --- a/scripts/Surveyor/raysurveyor-gentree.py +++ b/scripts/Surveyor/raysurveyor-gentree.py @@ -67,6 +67,7 @@ def condense_matrix(df_dist): vec.append(float(row[j])) dist_matrix.append(vec) + return dist_matrix def build_tree(dist_matrix, names_list, clust): @@ -77,13 +78,15 @@ def build_tree(dist_matrix, names_list, clust): tree_scikit = nj(dm,result_constructor=str) tree = Tree(tree_scikit) elif clust == 'upgma': - dm = _DistanceMatrix(names=names_list, matrix=dist_matrix) + dm = _DistanceMatrix(names=names_list, matrix=condense_matrix(dist_matrix)) constructor = DistanceTreeConstructor() - tree_biopython = constructor.upgma(condense_matrix(dm)) - # # remove InnerNode names - # for i in tree_biopython.get_nonterminals(): - # i.name = None - tree = Tree(tree_biopython) + tree_biopython = constructor.upgma(dm) + # remove InnerNode names + for i in tree_biopython.get_nonterminals(): + i.name = None + output = StringIO() + Phylo.write(tree_biopython,output, "newick") + tree = Tree(output.getvalue()) else: print("Unknown tree clustering method ! Aborting") sys.exit() From 725041cce17096f837bb68f15855b30837051ef7 Mon Sep 17 00:00:00 2001 From: zorino Date: Thu, 15 Dec 2016 11:11:32 -0500 Subject: [PATCH 20/23] add support to normalize matrix with another matrix norms Signed-off-by: zorino --- scripts/Surveyor/raysurveyor-gentree.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/scripts/Surveyor/raysurveyor-gentree.py b/scripts/Surveyor/raysurveyor-gentree.py index 35b119e2..4c17b9cd 100755 --- a/scripts/Surveyor/raysurveyor-gentree.py +++ b/scripts/Surveyor/raysurveyor-gentree.py @@ -33,8 +33,9 @@ def read_args(): mandatory_group.add_argument('--dist', default='euclidean', help='Distance metric [euclidean, cosine, minkowski, .. see https://goo.gl/zg2584') mandatory_group.add_argument('--tree', default='nj', - help='Clustering algorithm to build the tree: nj (neighbor joining) or upgma (rooted - not recommanded) ') + help='Clustering algorithm to build the tree: nj (neighbor joining) or upgma (rooted - not recommended)') mandatory_group.add_argument('--norm', help='Normalized the similarity matrix first', action='store_true') + mandatory_group.add_argument('--norms-matrix', help='Normalized with the norms of this matrix') mandatory_group.add_argument('--format', default='newick', help='Output Tree Format [newick or phyloxml]') mandatory_group.add_argument('-h', '--help', help='help message', action='store_true') @@ -51,9 +52,12 @@ def read_matrix(file): pd_frame = pd.read_table(file,sep='\t', skipinitialspace=True, index_col=0) return pd_frame - -def normalize_gram_matrix(K): - normX = np.sqrt(K.diagonal()) +def normalize_gram_matrix(K,K2=None): + normX = None + if K2 is not None: + normX = np.sqrt(K2.diagonal()) + else: + normX = np.sqrt(K.diagonal()) return ((K/normX).T/normX).T def condense_matrix(df_dist): @@ -146,7 +150,13 @@ def tree_distances(file): del_minus_fn = lambda *arr: np.clip(*arr,0,99999999999999999999999999) if args['norm']: - norm_dist = normalize_gram_matrix(df.as_matrix()) + norm_dist = None + if args['norms_matrix']: + norm_dist = normalize_gram_matrix(df.as_matrix(), read_matrix(args['norms_matrix']).as_matrix()) + else: + norm_dist = normalize_gram_matrix(df.as_matrix()) + # norm_matrix_out = pd.DataFrame(norm_dist, index=df.index, columns=df.columns) + # norm_matrix_out.to_csv(path_or_buf=outfile+".normalizedout.tsv", sep="\t") mat_dist = squareform(pdist(norm_dist, args['dist'])) # mat_dist = del_minus_fn(squareform(pdist(norm_dist, args['dist']))) else: From c276f3560eb764b62d6ad0108f0125dbc304dc5a Mon Sep 17 00:00:00 2001 From: zorino Date: Thu, 15 Dec 2016 11:12:27 -0500 Subject: [PATCH 21/23] add matrix-transform script Signed-off-by: zorino --- scripts/Surveyor/matrix-transform.py | 93 ++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100755 scripts/Surveyor/matrix-transform.py diff --git a/scripts/Surveyor/matrix-transform.py b/scripts/Surveyor/matrix-transform.py new file mode 100755 index 00000000..5b41341c --- /dev/null +++ b/scripts/Surveyor/matrix-transform.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# author: maxime déraspe +# email: maxime@deraspe.net +# date: 2016-12-15 +# version: 0.01 + +import sys +import pandas as pd +import numpy as np +import argparse + + +def read_args(): + + parser = argparse.ArgumentParser(description='Make transformation on the Gramian matrix', add_help=False) + + mandatory_group = parser.add_argument_group(title='Options') + mandatory_group.add_argument('-i', '--input', metavar='matrix.tsv', + help='Similarity matrix file (Gramian matrix) - Ray-Suveyor output') + mandatory_group.add_argument('-o', '--output', metavar='matrix.transformed.tsv', + default='matrix.transformed.tsv', help='Output matrix name') + mandatory_group.add_argument('-n', '--normalize', help='Normalize the similarity matrix', action='store_true') + mandatory_group.add_argument('-m', '--norms-matrix', metavar='norms-matrix.tsv', help='Normalize the similarity matrix') + mandatory_group.add_argument('-d', '--drop-indices', metavar='0,1,2', help='Drop indices list separated by coma e.g. 0,1,2') + mandatory_group.add_argument('-h', '--help', help='help message', action='store_true') + + args = vars(parser.parse_args()) + + if (args['input'] == None) or (args['help']): + parser.print_help() + sys.exit() + + return args + +def read_matrix(file): + pd_frame = pd.read_table(file,sep='\t', skipinitialspace=True, index_col=0) + return pd_frame + +def drop_indices(matrix, indices): + new_matrix = matrix.drop(matrix.columns[indices], axis=1) + new_matrix = new_matrix.drop(matrix.index[indices], axis=0) + return new_matrix + + +def normalize_gram_matrix(K,K2=None): + normX = None + if K2 is not None: + normX = np.sqrt(K2.diagonal()) + else: + normX = np.sqrt(K.diagonal()) + return ((K/normX).T/normX).T + + +# Main # +if __name__ == "__main__": + + args = read_args() + + outfile = "" + if args['output']: + outfile = args['output'] + else: + outfile = args['input'].replace(".tsv","") + + matrix = read_matrix(args['input']) + + indices = [] + if args['drop_indices']: + indices = [int(x) for x in args['drop_indices'].split(",")] + new_matrix = drop_indices(matrix, indices) + else: + new_matrix = matrix + + if args['normalize']: + if args['norms_matrix']: + if args['drop_indices']: + norms_matrix = drop_indices(read_matrix(args['norms_matrix']),indices) + else: + norms_matrix = read_matrix(args['norms_matrix']) + norm_matrix = normalize_gram_matrix(new_matrix.as_matrix(),norms_matrix.as_matrix()) + else: + norm_matrix = normalize_gram_matrix(new_matrix.as_matrix()) + else: + norm_matrix = new_matrix + + # matrix_new = matrix.drop(matrix.columns[[0]], axis=1) + # # matrix.drop(matrix.rows[[0,1]], axis=0) + # matrix_new = matrix_new[1:] + + norm_matrix_out = pd.DataFrame(norm_matrix, index=new_matrix.index, columns=new_matrix.columns) + norm_matrix_out.to_csv(path_or_buf=outfile, sep="\t") + From 6702e827b4b474860bfebf813c85ae59f4f5b481 Mon Sep 17 00:00:00 2001 From: zorino Date: Wed, 21 Dec 2016 14:29:28 -0500 Subject: [PATCH 22/23] edit treeclust hierarchical clustering method Signed-off-by: zorino --- scripts/Surveyor/treeclust-compare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/Surveyor/treeclust-compare.py b/scripts/Surveyor/treeclust-compare.py index 2f5dba51..de1df888 100755 --- a/scripts/Surveyor/treeclust-compare.py +++ b/scripts/Surveyor/treeclust-compare.py @@ -93,7 +93,7 @@ def compare_clusters(args): ref_df = pd.read_table(args['ref'], sep='\t', skipinitialspace=True, index_col=0).as_matrix() check_symmetry(ref_df) - linkage_ref = linkage(ref_df, 'ward') + linkage_ref = linkage(ref_df, 'average') c_ref, coph_dists_ref = cophenet(linkage_ref, pdist(ref_df)) outfile = open(args['output'],"w") @@ -109,7 +109,7 @@ def compare_clusters(args): n = 0 try: mantel_coeff, p_value_mantel, n = mantel(ref_df, fst_df) - linkage_fst = linkage(fst_df, 'ward') + linkage_fst = linkage(fst_df, 'average') c_fst, coph_dists_fst = cophenet(linkage_fst, pdist(fst_df)) cophenetic_pearson, p_value_cophenetic = pearsonr(coph_dists_ref, coph_dists_fst) except Exception as e: From e8eda44e421539a67d676e3815d940e9812f789b Mon Sep 17 00:00:00 2001 From: zorino Date: Thu, 5 Jan 2017 15:33:30 -0500 Subject: [PATCH 23/23] edit matrix-transform.py script Signed-off-by: zorino --- scripts/Surveyor/matrix-transform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/Surveyor/matrix-transform.py b/scripts/Surveyor/matrix-transform.py index 5b41341c..353b1992 100755 --- a/scripts/Surveyor/matrix-transform.py +++ b/scripts/Surveyor/matrix-transform.py @@ -21,7 +21,8 @@ def read_args(): mandatory_group.add_argument('-o', '--output', metavar='matrix.transformed.tsv', default='matrix.transformed.tsv', help='Output matrix name') mandatory_group.add_argument('-n', '--normalize', help='Normalize the similarity matrix', action='store_true') - mandatory_group.add_argument('-m', '--norms-matrix', metavar='norms-matrix.tsv', help='Normalize the similarity matrix') + mandatory_group.add_argument('-m', '--norms-matrix', metavar='norms-matrix.tsv', + help='Use the diagonal of this matrix to normalize the similarity matrix') mandatory_group.add_argument('-d', '--drop-indices', metavar='0,1,2', help='Drop indices list separated by coma e.g. 0,1,2') mandatory_group.add_argument('-h', '--help', help='help message', action='store_true')