TRACE/vcf2bed.py at main · collaborativebioinformatics/TRACE · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env python3
"""
Convert structural variant VCF file to BED format with enhanced BND parsing

Authors: Michal Izydorczyk, Nicola Wong, John Adedeji, Julian Chiu, and Thomas X. Garcia
"""

import argparse
import sys
import re
import os

def parse_arguments():
    parser = argparse.ArgumentParser(
        description='Convert structural variant VCF file to BED format with enhanced BND parsing',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Example usage:
  %(prog)s input.vcf                    # Creates input.bed in same directory
  %(prog)s input.vcf -o output.bed      # Creates output.bed
  %(prog)s /path/to/input.vcf           # Creates /path/to/input.bed
  %(prog)s input.vcf --no-BND-parse     # Skip BND ALT parsing
  %(prog)s input.vcf --include-INS      # Include INS variants in output

Output BED format:
  Column 1: Chromosome
  Column 2: Start position (0-based)
  Column 3: End position
  Column 4: Variant ID
  Column 5: REF allele (from VCF column 4)
  Column 6: ALT allele (from VCF column 5)
  Column 7: Quality score (from VCF column 6)
  Column 8: FILTER (from VCF column 7)
  Column 9: INFO (from VCF column 8)
  Column 10: FORMAT (from VCF column 9)

For BND variants with ALT parsing enabled (default):
  - Creates duplicate lines using breakpoint information from ALT column
  - ALT formats supported:
    * N]chr:pos] or N[chr:pos[
    * ]chr:pos]N or [chr:pos[N
    * REF]chr:pos] or REF[chr:pos[
    * ]chr:pos]REF or [chr:pos[REF
  - Duplicate line uses chr and pos from ALT field
        '''
    )

    parser.add_argument('input_vcf',
                        help='Input VCF file containing structural variants')

    parser.add_argument('-o', '--output',
                        help='Output BED file (default: same path as input with .bed extension)',
                        default=None)

    parser.add_argument('--no-BND-parse',
                        action='store_true',
                        help='Do not parse BND ALT column for duplicate entries (default: parse BND)')

    parser.add_argument('--include-INS',
                        action='store_true',
                        help='Include INS variants in output (default: exclude INS)')

    return parser.parse_args()

def extract_info_field(info_string, field_name):
    """Extract a specific field value from VCF INFO column"""
    pattern = f'{field_name}=([^;]+)'
    match = re.search(pattern, info_string)
    return match.group(1) if match else None

def parse_bnd_alt(alt_field):
    """
    Parse BND ALT field to extract chromosome and position
    ALT formats supported:
    - N]chr:pos] or N[chr:pos[
    - ]chr:pos]N or [chr:pos[N
    - REF]chr:pos] or REF[chr:pos[
    - ]chr:pos]REF or [chr:pos[REF
    Returns: (chromosome, position) or (None, None) if parsing fails
    """
    # Updated comprehensive patterns for all BND formats
    patterns = [
        # Pattern 1: Starts with nucleotide(s) followed by bracket notation
        # Matches: N]chr:pos], N[chr:pos[, T]chr:pos], G[chr:pos[, etc.
        r'^[ACGTN]+[\[\]]([^:]+):(\d+)[\[\]]$',

        # Pattern 2: Starts with bracket notation, ends with nucleotide(s)
        # Matches: ]chr:pos]N, [chr:pos[N, ]chr:pos]G, [chr:pos[T, etc.
        r'^[\[\]]([^:]+):(\d+)[\[\]][ACGTN]+$',

        # Pattern 3: Only bracket notation (no nucleotides on either side)
        # This might occur in some special cases
        r'^[\[\]]([^:]+):(\d+)[\[\]]$'
    ]

    for pattern in patterns:
        match = re.search(pattern, alt_field)
        if match:
            chrom = match.group(1)
            pos = int(match.group(2))
            return chrom, pos

    return None, None

def vcf_to_bed(input_file, output_file, parse_bnd=True, include_ins=False):
    """Convert VCF file to BED format with enhanced BND parsing"""

    try:
        with open(input_file, 'r') as vcf, open(output_file, 'w') as bed:
            line_count = 0
            variant_count = 0
            skipped_ins_count = 0
            parsed_bnd_count = 0
            unparsed_bnd_count = 0

            for line in vcf:
                line_count += 1

                # Skip header lines
                if line.startswith('#'):
                    continue

                # Parse VCF fields
                fields = line.strip().split('\t')
                if len(fields) < 9:
                    print(f"Warning: Line {line_count} has insufficient fields, skipping", file=sys.stderr)
                    continue

                chrom = fields[0]
                pos = int(fields[1])
                var_id = fields[2]
                ref = fields[3]
                alt = fields[4]
                qual = fields[5]
                filter_field = fields[6]
                info = fields[7]
                format_field = fields[8] if len(fields) >= 9 else ''

                # Extract SVTYPE from INFO field
                svtype = extract_info_field(info, 'SVTYPE')

                # Skip INS variants if not included
                if svtype == 'INS' and not include_ins:
                    skipped_ins_count += 1
                    continue

                # Extract END and SVLEN from INFO field
                end_str = extract_info_field(info, 'END')
                svlen_str = extract_info_field(info, 'SVLEN')

                # Determine end position
                if end_str:
                    end_pos = int(end_str)
                elif svlen_str:
                    svlen = int(svlen_str)
                    if svlen < 0:
                        # For deletions, END = POS + abs(SVLEN)
                        end_pos = pos + abs(svlen)
                    else:
                        # For insertions, END = POS (single position)
                        end_pos = pos
                else:
                    # Default: single position variant
                    end_pos = pos

                # Convert to BED format (0-based start)
                bed_start = pos - 1
                bed_end = end_pos

                # Write primary BED line
                bed.write(f"{chrom}\t{bed_start}\t{bed_end}\t{var_id}\t{ref}\t{alt}\t{qual}\t{filter_field}\t{info}\t{format_field}\n")
                variant_count += 1

                # For BND variants, optionally create duplicate line with parsed ALT information
                if svtype == 'BND' and parse_bnd:
                    bnd_chrom, bnd_pos = parse_bnd_alt(alt)
                    if bnd_chrom and bnd_pos:
                        # Handle edge case where position is 0 (telomeric regions)
                        # In BED format, position 0 becomes 0 (not -1)
                        if bnd_pos == 0:
                            bnd_start = 0
                            bnd_end = 1  # Use 1 for end position to have valid interval
                        else:
                            # Normal case: convert to 0-based
                            bnd_start = bnd_pos - 1
                            bnd_end = bnd_pos

                        bed.write(f"{bnd_chrom}\t{bnd_start}\t{bnd_end}\t{var_id}\t{ref}\t{alt}\t{qual}\t{filter_field}\t{info}\t{format_field}\n")
                        variant_count += 1
                        parsed_bnd_count += 1
                    else:
                        unparsed_bnd_count += 1
                        # Optionally log unparsed BND for debugging
                        # print(f"Warning: Could not parse BND ALT field: {var_id}\t{alt}", file=sys.stderr)

            print(f"Successfully converted {variant_count} variant lines from {input_file} to {output_file}")
            if skipped_ins_count > 0:
                print(f"Skipped {skipped_ins_count} INS variants (use --include-INS to include them)")
            if parse_bnd and (parsed_bnd_count > 0 or unparsed_bnd_count > 0):
                print(f"BND parsing: {parsed_bnd_count} parsed, {unparsed_bnd_count} unparsed")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found", file=sys.stderr)
        sys.exit(1)
    except PermissionError:
        print(f"Error: Permission denied accessing '{input_file}' or '{output_file}'", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {str(e)}", file=sys.stderr)
        sys.exit(1)

def main():
    args = parse_arguments()

    # Determine output file
    if args.output:
        output_file = args.output
    else:
        # Replace .vcf extension with .bed
        base_name = os.path.splitext(args.input_vcf)[0]
        output_file = base_name + '.bed'

    # Convert VCF to BED with enhanced options
    vcf_to_bed(args.input_vcf, output_file,
               parse_bnd=(not args.no_BND_parse),
               include_ins=args.include_INS)

if __name__ == '__main__':
    main()