Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
c0d5cb8
Better error message
AlexanderDilthey Oct 30, 2019
78feac6
Add reference
AlexanderDilthey Oct 30, 2019
81696ac
Update README.md
AlexanderDilthey Nov 14, 2019
8cfdb78
Possibility of using a user-defined graph directory
Dec 2, 2019
bed4ffd
Adding information which reference file is used to runtime terminal o…
Dec 2, 2019
fbc682a
Add missing file
AlexanderDilthey Dec 26, 2019
24930ad
Bugfix when not using minimap2
AlexanderDilthey Jan 7, 2020
2f2f40e
Merge branch 'master' of https://github.com/DiltheyLab/HLA-LA
Jan 14, 2020
f7b0ade
Update from upstream master
Jan 14, 2020
fe2c0a1
revert unnecessary changes
Jan 14, 2020
d386509
Removed changes on additionalReferencesDir
Jan 16, 2020
5311e5b
Report contig strandesness
AlexanderDilthey Feb 7, 2020
03bb164
Merge pull request #36 from jafors/master
AlexanderDilthey Jul 29, 2020
72d54dc
Update findPath.pm
jafors Aug 23, 2020
d9d27a0
Update findPath.pm
jafors Aug 23, 2020
66eb9ae
Update HLA-LA.pl
jafors Aug 24, 2020
b190dbe
Add prepareGraph to HLA-LA.pl
jafors Aug 25, 2020
a55f40a
correct handling of absolute and relative paths for custom graphdir
Aug 28, 2020
2de61d2
Cleanup
jafors Sep 2, 2020
2aa3d16
fmt
jafors Sep 2, 2020
a90f270
Merge pull request #50 from jafors/master
AlexanderDilthey Oct 1, 2020
f636b62
Reference added
AlexanderDilthey Oct 1, 2020
2a56726
Add additional reference
AlexanderDilthey Sep 1, 2021
a41f3ed
remove unnecessary db:HTS module
irunonayran Jun 22, 2022
fb8ae1c
Merge pull request #75 from irunonayran/master
AlexanderDilthey Jun 23, 2022
d2a9bd0
Allow for dashes in sample names
AlexanderDilthey Jul 17, 2023
ac6fdfb
Add lib64 library dir for bamtools
AlexanderDilthey Jul 17, 2023
cdd2423
Bugfix
AlexanderDilthey Jul 17, 2023
81714dc
Added reference file
AlexanderDilthey Jul 19, 2023
c123dd5
Also allow _ in sample names
AlexanderDilthey Jul 19, 2023
00e7663
Added T2T reference
AlexanderDilthey Jul 19, 2023
92187e8
Update README.md
AlexanderDilthey Jul 19, 2023
624f6fd
Do not fail if there are no unmapped reads
AlexanderDilthey Jul 20, 2023
2afe1f8
Merge branch 'master' of https://github.com/DiltheyLab/HLA-LA
AlexanderDilthey Jul 20, 2023
c532e1e
.
AlexanderDilthey Jul 20, 2023
595fb60
Create chm13_3.txt
AlexanderDilthey Oct 10, 2023
be9e747
Update chm13_3.txt
AlexanderDilthey Oct 10, 2023
42fe1d4
Add two new reference specification files
AlexanderDilthey Dec 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions HLA-ASM.pl
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ BEGIN
use Getopt::Long;
use Cwd qw/getcwd abs_path/;
use findPath;
use Bio::DB::HTS;

my $scriptPath = $FindBin::Bin;
my $includes=" -I" . join(" -I", @INC);
Expand Down Expand Up @@ -210,8 +209,9 @@ BEGIN
print DOTPLOTDATA join("\t", "region", "contigID", "refID", "refCoordinate", "contigCoordinate"), "\n";
print EDITDISTANCE join("\t", "region", "contigID", "refID", "editDistance"), "\n";
print CONTIGLENGTHS join("\t", "region", "refID", "length"), "\n";
print GENEPOSITIONS join("\t", "contigID", "gene", "definedFromRefID", "startInContig_0Based", "stopInContig_0Based", "capturedParts"), "\n";
print GENEPOSITIONS join("\t", "contigID", "contigStrand", "gene", "definedFromRefID", "startInContig_0Based_relativeToSpecifiedStrand", "stopInContig_0Based_relativeToSpecifiedStrand", "capturedParts"), "\n";

my %overlappingContig_strand_called;
my %results;
foreach my $region (sort keys %relevantRegions)
{
Expand Down Expand Up @@ -318,7 +318,6 @@ BEGIN
my %overlappingContigs;
my %overlappingContigs_idty;
my %overlappingContig_strands;
my %overlappingContig_strand_called;
my %strand_per_originalContig;
my %strand_per_haplotig;

Expand Down Expand Up @@ -479,7 +478,7 @@ BEGIN
# }
die unless(defined $strand);

$overlappingContig_strand_called{$contigID} = $strand;
$overlappingContig_strand_called{$region}{$contigID} = $strand;

my $contigSequence = $contigs_href->{$contigID};
if(defined $strand and ($strand eq '-'))
Expand Down Expand Up @@ -944,22 +943,24 @@ BEGIN


die unless (defined $found_starts_alignmentCoordinates{$k});
my $skipped_query_alignment = substr($alignments_per_refContig{$refContigID}[2], 0, $found_starts_alignmentCoordinates{$k});
$skipped_query_alignment =~ s/-//g;
# my $skipped_query_alignment = substr($alignments_per_refContig{$refContigID}[2], 0, $found_starts_alignmentCoordinates{$k});
# $skipped_query_alignment =~ s/-//g;

# die unless($found_starts_alignmentCoordinates{$k});
# my $total_alignment_till_end_plus_1 = substr($alignments_per_refContig{$refContigID}[2], 0, $found_starts_alignmentCoordinates{$k} + 1);
# $total_alignment_till_end_plus_1 =~ s/-//g;

# $extractedSequences_pos{$k} = [length($skipped_query_alignment), length($total_alignment_till_end_plus_1)];

# note to self: not quite clear what the logic here is - first we set
my $skipped_query_alignment = '';
if($found_starts_alignmentCoordinates{$k})
{
$skipped_query_alignment = substr($alignments_per_refContig{$refContigID}[2], 0, $found_starts_alignmentCoordinates{$k});
}
$skipped_query_alignment =~ s/-//g;
$skipped_query_alignment =~ s/-//g;


my $first_queryCoordinate = length($skipped_query_alignment);
my $last_queryCoordinate = $first_queryCoordinate + length($extracted_contig_alignment_noGap) - 1;
if(length($extracted_contig_alignment_noGap) == 0)
Expand Down Expand Up @@ -1008,8 +1009,8 @@ BEGIN
{
die unless(defined $components_per_gene{$gene});
print GENEPOSITIONS join("\t",
$refContigID,
$contigID,
$contigID,
$overlappingContig_strand_called{'MHC'}{$contigID},
$gene,
$refContigID,
min(@positions),
Expand Down Expand Up @@ -1266,14 +1267,15 @@ BEGIN
my $summaryFn = $working_dir . '/' . $sampleID . '/summary.txt';

open(SUMMARY, '>', $summaryFn) or die "Cannot open $summaryFn";
print SUMMARY join("\t", qw/contigID locus utilizedRefContig start stop calledGenotypes components editDistance_calledGenotypes_assembly minEditDistance_assembly_truth minEditDistance_calledGenotype_truth minEditDistance_assembly_truth_whichAlleles minEditDistance_calledGenotype_truth_whichAlleles/), "\n";
print SUMMARY join("\t", qw/contigID contigStrand locus utilizedRefContig startInContig_relativeToSpecifiedStrand stopInContig_relativeToSpecifiedStrand calledGenotypes components editDistance_calledGenotypes_assembly minEditDistance_assembly_truth minEditDistance_calledGenotype_truth minEditDistance_assembly_truth_whichAlleles minEditDistance_calledGenotype_truth_whichAlleles/), "\n";
foreach my $locus (sort {$a cmp $b} keys %results_by_locus)
{
foreach my $contigID (sort {$a cmp $b} keys %{$results_by_locus{$locus}})
{
if((not $max_report_edit_distance) or ($results_by_locus{$locus}{$contigID}[5] <= $max_report_edit_distance))
{
print SUMMARY join("\t", $contigID, $locus, @{$results_by_locus{$locus}{$contigID}}), "\n";
die unless(defined $overlappingContig_strand_called{'MHC'}{$contigID});
print SUMMARY join("\t", $contigID, $overlappingContig_strand_called{'MHC'}{$contigID}, $locus, @{$results_by_locus{$locus}{$contigID}}), "\n";
}
}
}
Expand Down
91 changes: 81 additions & 10 deletions HLA-LA.pl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

my $_BAM;
my $graph;
my $customGraphDir;
my $sampleID;
my $qsub;
my $samtools_bin;
Expand All @@ -27,11 +28,13 @@
my $moreReferencesDir;
my $extractExonkMerCounts;
my $longReads = 0;
my $prepareGraph = 0;
my $testing = 0;
my $samtools_T;
GetOptions (
'BAM:s' => \$_BAM,
'graph:s' => \$graph,
'customGraphDir:s' => \$customGraphDir,
'sampleID:s' => \$sampleID,
'qsub:s' => \$qsub,
'workingDir:s' => \$workingDir_param,
Expand All @@ -43,10 +46,44 @@
'moreReferencesDir:s' => \$moreReferencesDir,
'extractExonkMerCounts:s' => \$extractExonkMerCounts,
'longReads:s' => \$longReads,
'prepareGraph:s' => \$prepareGraph,
'testing:s' => \$testing,
'samtools_T:s' => \$samtools_T,
);

if ($prepareGraph)
{
my $full_graph_dir = $FindBin::RealBin . '/../graphs/' . $graph;
if($customGraphDir and (-e $customGraphDir))
{
my $dir = getcwd;
print "Using custom graph directory $customGraphDir\n";
$full_graph_dir = $customGraphDir . '/' . $graph;
my $is_absolute = File::Spec->file_name_is_absolute( $full_graph_dir );
unless($is_absolute)
{
$full_graph_dir = $dir . '/' . $customGraphDir . '/' . $graph;
}
}

my $MHC_PRG_2_bin = '../bin/HLA-LA';

my $previous_dir = getcwd;
chdir($this_bin_dir) or die "Cannot cd into $this_bin_dir";

die "Binary $MHC_PRG_2_bin not there!" unless(-e $MHC_PRG_2_bin);
my $command_MHC_PRG = qq($MHC_PRG_2_bin --action prepareGraph --PRG_graph_dir $full_graph_dir);

print "\nNow executing:\n$command_MHC_PRG\n";

if(system($command_MHC_PRG) != 0)
{
die "HLA-LA graph preparation not successful. Command was $command_MHC_PRG\n";
}
exit 0;
}


if ($extractExonkMerCounts)
{
warn "--extractExonkMerCounts 1: This feature is experimental and unlikely to work on your machine";
Expand Down Expand Up @@ -93,7 +130,7 @@
}
elsif($picard_sam2fastq_bin =~ /picard\.jar$/)
{
$FASTQ_extraction_command_part1 = qq($java_bin -Xmx10g -XX:-UseGCOverheadLimit -jar $picard_sam2fastq_bin SamToFastq);
$FASTQ_extraction_command_part1 = qq($java_bin -Xmx10g -XX:-UseGCOverheadLimit -jar $picard_sam2fastq_bin SamToFastq);
}
elsif($picard_sam2fastq_bin =~ /picard$/)
{
Expand Down Expand Up @@ -144,9 +181,9 @@

$working_dir = abs_path($working_dir);

unless($sampleID =~ /^\w+$/)
unless($sampleID =~ /^[\w\-_]+$/)
{
die "Please use only alphanumeric characters - \\w+ - for --sampleID";
die "Please use only alphanumeric characters - [\\w\\-_]+ - for --sampleID";
}
my $working_dir_thisSample = $working_dir . '/' . $sampleID;

Expand All @@ -172,11 +209,10 @@
die "Can't parse samtools version output" unless($samtools_version =~ /samtools ([\d\.]+)/);
$samtools_version = $1;
my $samtools_version_numeric = $samtools_version;
if($samtools_version_numeric =~ /^(\d+)\.(\d+)\.(\d+)$/)
{
$samtools_version_numeric = $1 . '.' . $2 . $3;
}
unless($samtools_version_numeric >= 1.3)
$samtools_version_numeric =~ /^(\d+)\.(\d+)/;
my $samtools_version_major = $1;
my $samtools_version_minor = $2;
unless($samtools_version_major > 1 or ($samtools_version_major == 1 and $samtools_version_minor >= 3))
{
die "I need samtools >=1.3";
}
Expand All @@ -193,6 +229,18 @@
}

my $full_graph_dir = $FindBin::RealBin . '/../graphs/' . $graph;
if($customGraphDir and (-e $customGraphDir))
{
my $dir = getcwd;
print "Using custom graph directory $customGraphDir\n";
$full_graph_dir = $customGraphDir . '/' . $graph;
my $is_absolute = File::Spec->file_name_is_absolute( $full_graph_dir );
unless($is_absolute)
{
$full_graph_dir = $dir . '/' . $customGraphDir . '/' . $graph;
}
}

my $known_references_dir = $full_graph_dir . '/knownReferences';
unless(-e $full_graph_dir)
{
Expand Down Expand Up @@ -230,6 +278,7 @@
my @files_references = glob($known_references_dir . '/*.txt');
die "No known reference files in knownReferences ($full_graph_dir)?" unless(@files_references);
my $additional_references_dir = $this_bin_dir . '/additionalReferences/' . $graph;

if(-e $additional_references_dir)
{
my @additional_files_references = glob($additional_references_dir . '/*.txt');
Expand Down Expand Up @@ -258,7 +307,7 @@
die "Incorrect header for $f ($#firstLine_fields vs $#expected_firstLine_fields)" unless($#firstLine_fields == $#expected_firstLine_fields);
for(my $i = 0; $i <= $#firstLine_fields; $i++)
{
die "Incorrect header for $f" unless($firstLine_fields[$i] eq $expected_firstLine_fields[$i]);
die "Incorrect header for $f - expect $expected_firstLine_fields[$i], got $firstLine_fields[$i]" unless($firstLine_fields[$i] eq $expected_firstLine_fields[$i]);
}

my $n_contigs = 0;
Expand Down Expand Up @@ -321,6 +370,8 @@

my $compatible_reference_file = $compatible_files[0];

print "Using $compatible_reference_file as reference file.\n";

my @refIDs_for_extraction;
foreach my $refID (@BAM_idx_contigOrder)
{
Expand Down Expand Up @@ -362,11 +413,31 @@
my $target_extraction_unmapped = $working_dir_thisSample . '/extraction_unmapped.bam';

my $extraction_command_unmapped = qq($samtools_bin view -\@ $threads_minus_1 $view_T_switch $BAM '*' | awk '{if (\$3 == "*") print \$0}' | $samtools_bin view -bo $target_extraction_unmapped -);

print "Extract unmapped reads...\n";

if(system($extraction_command_unmapped) != 0)
{
die "Extraction command $extraction_command_unmapped failed";

my $count_unmapped = qq($samtools_bin view -\@ $threads_minus_1 $view_T_switch $BAM '*' | awk '{if (\$3 == "*") print \$0}' | wc);
open(COUNT, "$count_unmapped |") or die "Cannot open pipe to count unmapped reads";
my $wc_out = <COUNT>;
chomp($wc_out);
close(COUNT);
die "Cannot parse wc output: '$wc_out'" unless($wc_out =~ /^\s*(\d+)\s+(\d+)\s+/);
my $n_unmapped_reads = $1;

if($n_unmapped_reads != 0)
{
die "Extraction command $extraction_command_unmapped failed - there are unmapped reads [$n_unmapped_reads]";
}
else
{
print "No unmapped reads, you can ignore the samtools warning if one was emitted...\n";
my $extraction_command_unmapped_generateEmptyBAM = qq($samtools_bin view -bo $target_extraction_unmapped -\@ $threads_minus_1 $view_T_switch $BAM '*');
system($extraction_command_unmapped_generateEmptyBAM) and die "Command $extraction_command_unmapped_generateEmptyBAM failed (empty BAM generation, no unmapped reads";
}

}

unlink($target_extraction) if (-e $target_extraction);
Expand Down
Loading