@@ -860,7 +860,7 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
860860 return std::tie (m1._n_sId , m1.qStart , m1.qEnd , m1.sStart , m1.sEnd , m1.qFrameShift , m1.sFrameShift ) ==
861861 std::tie (m2._n_sId , m2.qStart , m2.qEnd , m2.sStart , m2.sEnd , m2.qFrameShift , m2.sFrameShift );
862862 });
863- lH.stats .hitsDuplicate += before - record.matches .size ();
863+ lH.stats .hitsDuplicate2 += before - record.matches .size ();
864864
865865 // sort by evalue before writing
866866 record.matches .sort ([](auto const & m1, auto const & m2) { return m1.bitScore > m2.bitScore ; });
@@ -873,6 +873,14 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
873873 }
874874 lH.stats .hitsFinal += record.matches .size ();
875875
876+ /* count uniq qry-subj-pairs */
877+ lH.uniqSubjIds .clear ();
878+ lH.uniqSubjIds .reserve (record.matches .size ());
879+ for (auto const & bm : record.matches )
880+ lH.uniqSubjIds .insert (bm._n_sId );
881+
882+ lH.stats .pairs += lH.uniqSubjIds .size ();
883+
876884 // compute LCA
877885 if (lH.options .computeLCA )
878886 {
@@ -908,32 +916,25 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
908916// Function computeBlastMatch()
909917// --------------------------------------------------------------------------
910918
911- template <typename TBlastMatch, typename TLocalHolder>
912- inline void _setupAlignInfix (TBlastMatch & bm, typename TLocalHolder::TMatch const & m, TLocalHolder & lH)
919+ template <typename TLocalHolder>
920+ inline void _widenMatch (Match & m, TLocalHolder const & lH)
913921{
914- int64_t startMod = (int64_t )m.subjStart - (int64_t )m.qryStart ;
922+ // move sStart as far left as needed to cover the part of query before qryStart
923+ m.subjStart = (m.subjStart < m.qryStart ) ? 0 : m.subjStart - m.qryStart ;
915924
916- bm.qEnd = lH.transQrySeqs [m.qryId ].size ();
917- decltype (bm.qEnd ) band = _bandSize (bm.qEnd );
918- if (startMod >= 0 )
919- {
920- bm.sStart = startMod;
921- bm.qStart = 0 ;
922- }
923- else
924- {
925- bm.sStart = 0 ;
926- bm.qStart = -startMod;
927- }
928- bm.sEnd = std::min<size_t >(bm.sStart + bm.qEnd - bm.qStart + band, lH.gH .transSbjSeqs [m.subjId ].size ());
925+ /* always align full query independent of hit-region */
926+ m.qryStart = 0 ;
927+ m.qryEnd = lH.transQrySeqs [m.qryId ].size ();
929928
930- if (bm.sStart >= band)
931- bm.sStart -= band;
932- else
933- bm.sStart = 0 ;
929+ // there is no band in computation but this value extends begin and end of Subj to account for gaps
930+ uint64_t band = _bandSize (lH.transQrySeqs [m.qryId ].size ());
931+
932+ // end on subject is beginning plus full query length plus band
933+ m.subjEnd =
934+ std::min<size_t >(m.subjStart + lH.transQrySeqs [m.qryId ].size () + band, lH.gH .transSbjSeqs [m.subjId ].size ());
934935
935- seqan::assignSource (bm. alignRow0 , lH. transQrySeqs [m. qryId ] | bio::views::slice (bm. qStart , bm. qEnd ));
936- seqan::assignSource (bm. alignRow1 , lH. gH . transSbjSeqs [m. subjId ] | bio::views::slice (bm. sStart , bm. sEnd )) ;
936+ // account for band in subj start
937+ m. subjStart = (band < m. subjStart ) ? m. subjStart - band : 0 ;
937938}
938939
939940template <typename TBlastMatch, typename TLocalHolder>
@@ -1133,7 +1134,48 @@ inline void _performAlignment(TDepSetH & depSetH,
11331134}
11341135
11351136template <typename TLocalHolder>
1136- inline void iterateMatchesFullSimd (TLocalHolder & lH, bsDirection const dir = bsDirection::fwd)
1137+ inline void _widenAndPreprocessMatches (std::span<Match> & matches, TLocalHolder & lH)
1138+ {
1139+ auto before = matches.size ();
1140+
1141+ for (Match & m : matches)
1142+ _widenMatch<TLocalHolder>(m, lH);
1143+
1144+ std::ranges::sort (matches);
1145+
1146+ if (matches.size () > 1 )
1147+ {
1148+ // pairwise merge from left to right
1149+ for (auto it = matches.begin (); it < matches.end () - 1 ; ++it)
1150+ {
1151+ Match & l = *it;
1152+ Match & r = *(it + 1 );
1153+ if ((std::tie (l.qryId , l.subjId ) == std::tie (r.qryId , r.subjId )) && (l.subjEnd >= r.subjStart ))
1154+ {
1155+ l.subjEnd = r.subjEnd ;
1156+ r.subjStart = l.subjStart ;
1157+ }
1158+ }
1159+
1160+ // pairwise "swallow" from right to left
1161+ for (auto it = matches.rbegin (); it < matches.rend () - 1 ; ++it)
1162+ {
1163+ Match & r = *it;
1164+ Match & l = *(it + 1 );
1165+ if ((std::tie (r.qryId , r.subjId ) == std::tie (l.qryId , l.subjId )) && (r.subjStart < l.subjEnd ))
1166+ {
1167+ l = r;
1168+ }
1169+ }
1170+
1171+ auto [new_end, old_end] = std::ranges::unique (matches); // move non-uniq to the end
1172+ matches = std::span<Match>{matches.begin (), new_end}; // "resize" of the span
1173+ lH.stats .hitsDuplicate += (before - matches.size ());
1174+ }
1175+ }
1176+
1177+ template <typename TLocalHolder>
1178+ inline void iterateMatchesFullSimd (std::span<Match> lambdaMatches, TLocalHolder & lH, bsDirection const dir)
11371179{
11381180 using TGlobalHolder = typename TLocalHolder::TGlobalHolder;
11391181 using TBlastMatch = typename TLocalHolder::TBlastMatch;
@@ -1143,7 +1185,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
11431185 // statistics
11441186#ifdef LAMBDA_MICRO_STATS
11451187 ++lH.stats .numQueryWithExt ;
1146- lH.stats .numExtScore += seqan::length (lH. matches );
1188+ lH.stats .numExtScore += seqan::length (lambdaMatches );
11471189
11481190 double start = sysTime ();
11491191#endif
@@ -1152,58 +1194,37 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
11521194 seqan::StringSet<typename seqan::Source<typename TLocalHolder::TAlignRow0>::Type> depSetH;
11531195 seqan::StringSet<typename seqan::Source<typename TLocalHolder::TAlignRow1>::Type> depSetV;
11541196
1155- // create blast matches
1197+ // pre-sort and filter
1198+ _widenAndPreprocessMatches (lambdaMatches, lH);
1199+
1200+ // create blast matches from Lambda matches
11561201 std::list<TBlastMatch> blastMatches;
1157- for (auto it = lH. matches . begin (), itEnd = lH. matches . end (); it != itEnd; ++it )
1202+ for (Match const & m : lambdaMatches )
11581203 {
1159- // In BS-mode, skip those results that have wrong orientation
1160- if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
1161- {
1162- if ((dir == bsDirection::fwd && (it->subjId % 2 )) || (dir == bsDirection::rev && !(it->subjId % 2 )))
1163- continue ;
1164- }
11651204 // create blastmatch in list without copy or move
1166- blastMatches.emplace_back (lH.qryIds [it-> qryId / TGlobalHolder::qryNumFrames],
1167- const_gH.indexFile .ids [it-> subjId / TGlobalHolder::sbjNumFrames]);
1205+ blastMatches.emplace_back (lH.qryIds [m. qryId / TGlobalHolder::qryNumFrames],
1206+ const_gH.indexFile .ids [m. subjId / TGlobalHolder::sbjNumFrames]);
11681207
11691208 TBlastMatch & bm = blastMatches.back ();
11701209
1171- bm._n_qId = it-> qryId / TGlobalHolder::qryNumFrames;
1172- bm._n_sId = it-> subjId / TGlobalHolder::sbjNumFrames;
1210+ bm._n_qId = m. qryId / TGlobalHolder::qryNumFrames;
1211+ bm._n_sId = m. subjId / TGlobalHolder::sbjNumFrames;
11731212
1174- bm.qLength = // std::ranges::size(lH.transQrySeqs[it->qryId ]);
1175- std::ranges::size (lH.qrySeqs [bm._n_qId ]);
1213+ bm.qLength = std::ranges::size (lH.qrySeqs [bm. _n_qId ]);
1214+ bm. sLength = std::ranges::size (lH.gH . indexFile . seqs [bm._n_sId ]);
11761215
1177- bm.sLength = // std::ranges::size(lH.gH.transSbjSeqs[it->subjId]);
1178- std::ranges::size (lH.gH .indexFile .seqs [bm._n_sId ]);
1216+ bm.qStart = m.qryStart ;
1217+ bm.qEnd = m.qryEnd ;
1218+ bm.sStart = m.subjStart ;
1219+ bm.sEnd = m.subjEnd ;
1220+ seqan::assignSource (bm.alignRow0 , lH.transQrySeqs [m.qryId ] | bio::views::slice (bm.qStart , bm.qEnd ));
1221+ seqan::assignSource (bm.alignRow1 , lH.gH .transSbjSeqs [m.subjId ] | bio::views::slice (bm.sStart , bm.sEnd ));
11791222
1180- _setupAlignInfix (bm, *it, lH);
1181-
1182- _setFrames (bm, *it, lH);
1223+ _setFrames (bm, m, lH);
11831224
11841225 if (lH.options .hasSTaxIds )
11851226 bm.sTaxIds = lH.gH .indexFile .sTaxIds [bm._n_sId ];
11861227 }
1187- #ifdef LAMBDA_MICRO_STATS
1188- lH.stats .timeExtend += sysTime () - start;
1189-
1190- // filter out duplicates
1191- start = sysTime ();
1192- #endif
1193- auto before = seqan::length (blastMatches);
1194- blastMatches.sort (
1195- [](auto const & l, auto const & r)
1196- {
1197- return std::tie (l._n_qId , l._n_sId , l.sStart , l.sEnd , l.qStart , l.qEnd , l.qFrameShift , l.sFrameShift ) <
1198- std::tie (r._n_qId , r._n_sId , r.sStart , r.sEnd , r.qStart , r.qEnd , r.qFrameShift , r.sFrameShift );
1199- });
1200- blastMatches.unique (
1201- [](auto const & l, auto const & r)
1202- {
1203- return std::tie (l._n_qId , l._n_sId , l.sStart , l.sEnd , l.qStart , l.qEnd , l.qFrameShift , l.sFrameShift ) ==
1204- std::tie (r._n_qId , r._n_sId , r.sStart , r.sEnd , r.qStart , r.qEnd , r.qFrameShift , r.sFrameShift );
1205- });
1206- lH.stats .hitsDuplicate += (before - seqan::length (blastMatches));
12071228
12081229 // sort by lengths to minimize padding in SIMD
12091230 blastMatches.sort (
@@ -1217,6 +1238,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
12171238
12181239 start = sysTime ();
12191240#endif
1241+
12201242 // fill batches
12211243 _setupDepSets (depSetH, depSetV, blastMatches);
12221244
@@ -1342,12 +1364,24 @@ inline void writeRecords(TLocalHolder & lH)
13421364template <typename TLocalHolder>
13431365inline void iterateMatches (TLocalHolder & lH)
13441366{
1345- iterateMatchesFullSimd (lH, bsDirection::fwd);
13461367 if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
13471368 {
1348- iterateMatchesFullSimd (lH, bsDirection::rev);
1369+ std::ranges::sort (lH.matches ,
1370+ [](Match const & l, Match const & r) {
1371+ return std::tuple<bool , Match const &>{l.subjId % 2 , l} <
1372+ std::tuple<bool , Match const &>{r.subjId % 2 , r};
1373+ });
1374+
1375+ auto it = std::ranges::find_if (lH.matches , [](Match const & m) { return m.subjId % 2 ; });
1376+
1377+ iterateMatchesFullSimd (std::span{lH.matches .begin (), it}, lH, bsDirection::fwd);
1378+ iterateMatchesFullSimd (std::span{it, lH.matches .end ()}, lH, bsDirection::rev);
13491379 lH.blastMatches .sort ([](auto const & lhs, auto const & rhs) { return lhs._n_qId < rhs._n_qId ; });
13501380 }
1381+ else
1382+ {
1383+ iterateMatchesFullSimd (lH.matches , lH, bsDirection::fwd);
1384+ }
13511385}
13521386
13531387// -----------------------------------------------------------------------
0 commit comments