Skip to content

Commit d07a1d6

Browse files
authored
physlr-make: update rules to reflect new features (#141)
* physlr-make: update rules to reflect new features * physlr-make: update map-paf rule prerequisite * physlr-make: update map-paf comment * physlr-make: update rule * add extend mol rules * unittest: update split-minimizers unit test * unittest: remove old split-minimizers unit test files * physlr-make: remove temp rules * physlr-make: use mol rules * physlr-make: update help message * physlr-make: update help message
1 parent 3326156 commit d07a1d6

7 files changed

Lines changed: 56 additions & 55 deletions

bin/physlr-make

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ t=16
1515
bloom_filter_size=10000000000 #10GB
1616

1717
# Molecule separation strategy
18-
mol_strategy=distributed
18+
mol_strategy=distributed+sqcosbin
1919

2020
# Path to the Physlr project.
2121
physlr_path=$(shell dirname $$(dirname $(realpath $(MAKEFILE_LIST))))
@@ -103,13 +103,13 @@ arcs=false
103103
.PHONY: f1chr4 f1chr2R f1 fishchr25 fish physical-map scaffolds
104104
all: f1chr4 f1chr2R f1 fishchr25 fish
105105

106-
$(lr).physlr.physical-map.path: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.backbone.path
106+
$(lr).physlr.physical-map.path: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.backbone.path
107107
ln -sf $< $@
108108

109-
$(lr).physlr.physical-map.$(ref).n10.paf.gz: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.backbone.map-split.$(ref).n10.paf.gz
109+
$(lr).physlr.physical-map.$(ref).n10.paf.gz: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.backbone.map-split.$(ref).n10.paf.gz
110110
ln -sf $< $@
111111

112-
$(draft).physlr.fa: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.backbone.map-split.$(draft).n10.sort.best.bed.path.fa
112+
$(draft).physlr.fa: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.backbone.map-split.$(draft).n10.sort.best.bed.path.fa
113113
ln -sf $< $@
114114

115115
scaffolds:
@@ -151,6 +151,7 @@ physical-map: \
151151
$(lr).physlr.physical-map.$(ref).n10.qpos.chain.metrics.tsv
152152
endif
153153
endif
154+
154155
# Help
155156
help:
156157
@echo "Usage: ./physlr-make [COMMAND] [OPTION=VALUE]..."
@@ -184,11 +185,11 @@ help:
184185
@echo " min_component_size minimum number of barcodes in a backbone [50]."
185186
@echo " minimum_barcode_multiplicity minimum number of minimizers per barcode [10]."
186187
@echo " maximum_barcode_multiplicity maximum number of minimizers per barcode [5000]."
187-
@echo " mol_strategy molecule separation strategy [distributed]. Available options are bc, bc+k3, distributed, ext."
188+
@echo " mol_strategy molecule separation strategy [distributed+sqcosbin]. Available options are bc, bc+k3, distributed, distributed+sqcosbin."
188189
@echo " bc (biconnected componenets) is the least conservative and is only suitable for datasets with low barcode multiplicity."
189190
@echo " bc+k3 (biconnected componenets + k-3 cliques) is more conservative than bc and requires more time."
190191
@echo " distributed is a modified version of bc+k3 that is faster than bc+k3 but may be more (or even less) conservative."
191-
@echo " ext (extensive) mixes distributed with a modified version of sqcos (cosine similarity of squared adjacency matrix) which makes it more conservative."
192+
@echo " distributed+sqcosbin mixes distributed with a modified version of sqcos (cosine similarity of squared adjacency matrix) which makes it more conservative."
192193
@echo " bloom_filter_size size of bloom filter [10000000000] (10G)."
193194
@echo " arcs Use ARCS to augment scaffolds (only compatible with ARCS v1.1.1) [false]."
194195
@echo ""
@@ -898,7 +899,7 @@ endif
898899

899900
# Determine overlaps and output the graph in TSV.
900901
%.physlr.overlap.tsv: %.physlr.tsv
901-
$(time) $(physlr_path)/src/physlr-overlap -t1 -n10 $< >$@
902+
$(time) $(physlr_path)/src/physlr-overlap -t$t -m10 $< >$@
902903

903904
# Determine the degree of each vertex.
904905
%.deg.tsv: %.tsv
@@ -958,7 +959,7 @@ min_path_size=200
958959
$(python) $(bin)/physlr flesh-backbone --min-component-size=$(min_component_size) -V$V $< $*.backbone.path >$@
959960

960961
# Split the minimizers to molecules
961-
%.overlap.m$m.mol.mol2-bcs.split.tsv: %.overlap.m$m.mol.mol2-bcs.tsv %.tsv
962+
%.overlap.m$m.mol.split.tsv: %.overlap.m$m.mol.tsv %.tsv
962963
$(time) $(physlr_path)/src/physlr-split-minimizers -t$t $< $*.tsv >$@
963964

964965
# Split the reads into molecules
@@ -979,7 +980,7 @@ min_path_size=200
979980
$(time) $(python) $(bin)/physlr map -V$V -n10 $^ >$@
980981

981982
# Map the draft assembly to the backbone graph and output BED.
982-
%.backbone.map-split.$(draft).n10.bed: %.backbone.path $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.split.tsv $(draft).k$k-w$w.physlr.tsv
983+
%.backbone.map-split.$(draft).n10.bed: %.backbone.path %.split.tsv $(draft).k$k-w$w.physlr.tsv
983984
$(time) $(python) $(bin)/physlr map --mx-type split --map-pos 10 -V$V -n10 $^ >$@
984985

985986
# Map the draft assembly to the backbone graph and output BED.
@@ -1034,6 +1035,10 @@ min_path_size=200
10341035
%.map.$(ref).n10.paf.gz: %.path $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.tsv $(name)/$(ref).k$k-w$w.physlr.tsv
10351036
$(time) $(python) $(bin)/physlr map-paf -V$V -n10 $^ | $(gzip) >$@
10361037

1038+
# Map the reference to the backbone graph with split minimizers and output PAF.
1039+
%.backbone.map-split.$(ref).n10.paf.gz: %.backbone.path %.split.tsv $(name)/$(ref).k$k-w$w.physlr.tsv
1040+
$(time) $(python) $(bin)/physlr map-paf --mx-type split -V$V -n10 $^ | $(gzip) >$@
1041+
10371042
# Lift over query coordinates of a PAF file from minimizer index to nucleotide coordinates.
10381043
%.qpos.paf.gz: $(name)/$(ref).k$k-w$w.physlr.tsv %.paf.gz
10391044
$(zcat) $*.paf.gz | $(time) $(python) $(bin)/physlr liftover-paf -V$V $< - | $(gzip) >$@

src/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ check-physlr-molecules: all
6767
./physlr-molecules -s bc data/tiny.mol.input.tsv | diff -q - data/tiny.mol.tsv.good
6868

6969
check-physlr-split-minimizers: all
70-
./physlr-split-minimizers -t4 data/tiny.split-minimizers.mol.mol2-bcs.tsv data/tiny.split-minimizers.physlr.tsv | sort |diff -q - data/tiny.split-minimizers.mol.mol2-bcs.split.tsv.good
70+
./physlr-split-minimizers -t4 data/tiny.split-minimizers.ext.mol.tsv data/tiny.split-minimizers.physlr.tsv | sort |diff -q - data/tiny.split-minimizers.ext.mol.split.tsv.good
7171

7272
install: physlr-indexlr physlr-filter-barcodes physlr-overlap physlr-filter-bxmx physlr-makebf physlr-molecules physlr-split-minimizers
7373
install -d $(DESTDIR)$(PREFIX)/bin
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
100_55_25_0
2+
100_55_25_90 1 3 5 7
3+
100_55_25_91 2 6 8 15
4+
22_1_9_0 2 6 8
5+
543_288_92_0 1 3 5 7
6+
75_288_50_0 1 3 7
7+
92_300_57_0 2 6 15
8+
AAACACCAGAAACCTA-1_0
9+
AAACACCAGAAACCTA-1_90 1 3 5 7
10+
AAACACCAGAAACCTA-1_91 2 6 8 15
11+
AAACACCAGAAAGCTT-1_0 1 3 5 7
12+
AAACACCAGAACGACC-1_0 1 3 7
13+
AAACACCAGAACGACT-1_0 2 6 15
14+
AAACACCAGAACGCCA-1_0 2 6 8
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
U m
2+
AAACACCAGAAACCTA-1_0 1254
3+
AAACACCAGAAACCTA-1_91 1254
4+
AAACACCAGAAACCTA-1_90 1254
5+
AAACACCAGAAAGCTT-1_0 1313
6+
AAACACCAGAACGACC-1_0 1819
7+
AAACACCAGAACGACT-1_0 1819
8+
AAACACCAGAACGCCA-1_0 4173
9+
100_55_25_0 1254
10+
100_55_25_91 1254
11+
100_55_25_90 1254
12+
543_288_92_0 1313
13+
75_288_50_0 1819
14+
92_300_57_0 1819
15+
22_1_9_0 4173
16+
17+
U V m
18+
100_55_25_90 543_288_92_0 1
19+
100_55_25_90 75_288_50_0 1
20+
100_55_25_91 92_300_57_0 1
21+
100_55_25_91 22_1_9_0 1
22+
AAACACCAGAAACCTA-1_90 AAACACCAGAAAGCTT-1_0 1
23+
AAACACCAGAAACCTA-1_90 AAACACCAGAACGACC-1_0 1
24+
AAACACCAGAAACCTA-1_91 AAACACCAGAACGACT-1_0 1
25+
AAACACCAGAAACCTA-1_91 AAACACCAGAACGCCA-1_0 1

src/data/tiny.split-minimizers.mol.mol2-bcs.split.tsv.good

Lines changed: 0 additions & 16 deletions
This file was deleted.

src/data/tiny.split-minimizers.mol.mol2-bcs.tsv

Lines changed: 0 additions & 27 deletions
This file was deleted.

src/physlr-split-minimizers.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ findMoleculesPerBarcode(bxToMolIdx_t& bxToMolIdx, const graph_t& g)
185185
{
186186
auto vertexItRange = boost::vertices(g);
187187
for (auto vertexIt = vertexItRange.first; vertexIt != vertexItRange.second; ++vertexIt) {
188-
std::string pattern = R"((\S+)_\d+_\d+$)";
188+
std::string pattern = R"((\S+)_\d+$)";
189189
std::regex rgx(pattern);
190190
std::smatch matches;
191191

@@ -228,7 +228,7 @@ splitMinimizers(
228228
tsl::robin_set<Minimizer> neighbourMxsUnion;
229229
for (auto neighbourItr = neighbours.first; neighbourItr != neighbours.second;
230230
++neighbourItr) {
231-
std::string pattern = R"((\S+)_\d+_\d+$)";
231+
std::string pattern = R"((\S+)_\d+$)";
232232
std::regex rgx(pattern);
233233
std::smatch matches;
234234
if (std::regex_search(g[*neighbourItr].name, matches, rgx)) {

0 commit comments

Comments
 (0)