Breeding-Insight · Cristianetaniguti · Jul 2, 2025 · Jul 2, 2025 · Jul 2, 2025 · Jul 2, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+.DS_Store
diff --git a/.idea/BIGapp-PanelHub.iml b/.idea/BIGapp-PanelHub.iml
diff --git a/.idea/ChatGPTCopilotChannelManager.xml b/.idea/ChatGPTCopilotChannelManager.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/python-terminal.xml b/.idea/python-terminal.xml
diff --git a/.idea/vagrant.xml b/.idea/vagrant.xml
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
diff --git a/20250702-build_cucumber_madc_matchHap_template.bash b/20250702-build_cucumber_madc_matchHap_template.bash
@@ -0,0 +1,181 @@
+#!/bin/bash
+# Base db was already generated for REF and ALT alleles
+
+# Note: change here ###### 
+# The readme file should be one version above the current db version
+# If no new alleles are found, give the readme file another name
+PROCESS_README='/Users/dz359/PycharmProjects/BI/cucumber_dartag_00_microhaplotype_db/data/cucumber_allele_db_v002_process.readme'
+NO_NEW_ALLELE_README='/Users/dz359/PycharmProjects/BI/cucumber_dartag_00_microhaplotype_db/data/cucumber_allele_db_v001_DCu25-10410_noNewAllele.readme'
+exec &> $PROCESS_README
+
+# Scripts and files
+SCRIPTS_DIR='/Users/dz359/PycharmProjects/BI/01_dartag_alleles/code/02_fixedAlleleID'
+CODE_VER='v1'
+MARKERID_LUT='/Users/dz359/PycharmProjects/BI/cucumber_dartag_00_microhaplotype_db/data/f180bp/20221019-BI-Cucumber_DArTag-probe-desgin_snpID_lut.csv'
+ALLELE_DB_DIR='/Users/dz359/PycharmProjects/BI/cucumber_dartag_00_microhaplotype_db/data'
+ALLELE_DB='cucumber_allele_db_v001.fa'
+MATCHCNT_LUT='cucumber_allele_db_v001_matchCnt_lut.txt'
+REPORT='/Users/dz359/PycharmProjects/BI/cucumber_dartag_P06_qtlMapping_5plates/data/Report-DCu25-10410/DCu25-10410_MADC.csv'
+
+FIRST_SAMPLE_COL=17
+
+# Marker panel information
+DESIGN_LEN=81
+SEQ_LEN=109
+
+
+# Processing starts here
+now=$(date)
+printf "%s\n" "$now"
+
+
+printf "\n# 1). Update snpIDs to Chr_00xxxxxxx format in MADC"
+printf "\n  # This is necessary because marker IDs do not always follow the Chr_00xxxxxxx format\n"
+REPORT_SNPID=${REPORT%????}'_snpID.csv'
+if [ -f $REPORT_SNPID ]; then
+    echo "$REPORT_SNPID exists. Skip this step."
+else
+    python $SCRIPTS_DIR/step00_madc_update_snpID_v1.1.py $MARKERID_LUT $REPORT
+fi
+
+
+printf "\n# 2). Check if there are any duplicate alleles - same microhaplotypes with different IDs"
+printf "\n  # If there are duplicates, retain only one allele of the duplicated ones\n"
+# https://stackoverflow.com/questions/947897/how-to-comment-a-block-of-code-in-bash
+
+python $SCRIPTS_DIR/step00_check_allele_uniqueness_AND_update_madc_v1.py $REPORT_SNPID $FIRST_SAMPLE_COL
+REPORT_SNPID_UNIQ=${REPORT%????}'_snpID_uniq.csv'
+if test -f "$REPORT_SNPID_UNIQ"; then
+    printf "\n  # Updated MADC file: $REPORT_SNPID_UNIQ\n"
+    REPORT_SNPID=$REPORT_SNPID_UNIQ
+else
+    printf "\n  # No duplicates found in MADC file\n"
+fi
+
+
+printf "\n# 3). Filter alleles with missing data and generate RefMatch and AltMatch allele fasta and a temporary report\n"
+python $SCRIPTS_DIR/step01_filter_missing_AND_ext_matchAlleles_from_madc_v1.py $REPORT_SNPID
+# The output of this step is a temporary report file with renamed marker IDs and RefMatch and AltMatch allele fasta files
+TMP_RENAME=${REPORT_SNPID%????}'_tmp_rename.csv'
+MATCH_ALLELES=${REPORT_SNPID%????}'_match.fa'
+
+
+printf "\n#  4). Remove adapters using cutadapt"
+#  -n COUNT, --times COUNT
+#                        Remove up to COUNT adapters from each read. Default: 1
+#  -O MINLENGTH, --overlap MINLENGTH
+#                        Require MINLENGTH overlap between read and adapter for an adapter to be found.
+#                        Default: 3
+# -a ACCGATCTCGTATGCCGTCTTCTGCTTG 
+if [ $DESIGN_LEN -eq $SEQ_LEN ]; then
+    printf "\n  # Both design length and sequencing length are the same. No need to run adaptor checking\n"
+else
+    printf "\n  # Sequencing length is longer than panel design length, run cutadapt\n"
+    CUTADAPT=${REPORT_SNPID%????}'_match_cutadapt.fa'
+    CUT_LOG=${REPORT_SNPID%????}'_match_cutadapt.log'
+    /Users/dz359/Library/Python/3.9/bin/cutadapt -a ACCGATCTG -e 0.2 -n 1 --overlap 5 -m $DESIGN_LEN -o $CUTADAPT $MATCH_ALLELES > $CUT_LOG
+    # Some Illumina instruments use a two-color chemistry to encode the four bases. This includes the NextSeq and the NovaSeq. 
+    # In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. 
+    # However, dark cycles also occur when sequencing “falls off” the end of the fragment. 
+    # The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end.
+    MATCH=$(grep -c ">" $MATCH_ALLELES)
+    MATCH_CUT=$(grep -c ">" $CUTADAPT)
+    printf "\n    # Number of RefMatch and AltMatch extracted from MADC: $MATCH"
+    printf "\n    # Number of RefMatch and AltMatch retained after cutadapt: $MATCH_CUT\n" 
+
+    printf "\n#  5). Check if there are duplicate alleles after removing adapters AND update allele sequences and read counts\n"
+    printf "  # CUTADAPT file: $CUTADAPT\n"
+    printf "  # Temporary rename file: $TMP_RENAME\n"
+    python $SCRIPTS_DIR/step03_check_cutadapt_allele_uniqueness_AND_update_tmp_rename_report_v1.1.py $CUTADAPT $ALLELE_DB_DIR/$ALLELE_DB $TMP_RENAME
+    CUTADAPT_UNI=${REPORT_SNPID%????}'_match_cutadapt_unique.fa'
+    TMP_RENAME_UPDATED=${REPORT_SNPID%????}'_tmp_rename_updatedSeq.csv'
+fi
+
+
+printf "\n#  6a). First, create the BLAST database"
+# 1. First, create the BLAST database
+makeblastdb -in $ALLELE_DB_DIR/$ALLELE_DB \
+            -dbtype nucl \
+            -parse_seqids
+
+printf "\n#  6b). BLAST RefMatch and AltMatch against the allele db\n"
+# If there are no duplicate alleles, there won't be the a '_match_cutadapt_unique.fa'
+# There won't be a '_tmp_rename_updatedSeq.csv' either
+# Here, use if else to execute different input files.
+if test -f "$CUTADAPT"; then
+    if test -f "$CUTADAPT_UNI"; then
+      BLAST_DBBLAST=${REPORT_SNPID%????}'_match_cutadapt_unique.fa.alleledb.bn'
+      blastn -task blastn-short -dust no -soft_masking false -db $ALLELE_DB_DIR/$ALLELE_DB -query $CUTADAPT_UNI -out $BLAST_DBBLAST -evalue 1e-5 -num_threads 6 -max_target_seqs 15 -outfmt '6 qseqid qlen qstart qend sseqid slen sstart send length qcovs pident evalue'
+    else
+      BLAST_DBBLAST=${REPORT_SNPID%????}'_match_cutadapt.fa.alleledb.bn'
+      blastn -task blastn-short -dust no -soft_masking false -db $ALLELE_DB_DIR/$ALLELE_DB -query $CUTADAPT -out $BLAST_DBBLAST -evalue 1e-5 -num_threads 6 -max_target_seqs 15 -outfmt '6 qseqid qlen qstart qend sseqid slen sstart send length qcovs pident evalue'
+    fi 
+else
+    BLAST_DBBLAST=${REPORT_SNPID%????}'_match.fa.alleledb.bn'
+    blastn -task blastn-short -dust no -soft_masking false -db $ALLELE_DB_DIR/$ALLELE_DB -query $MATCH_ALLELES -out $BLAST_DBBLAST -evalue 1e-5 -num_threads 6 -max_target_seqs 15 -outfmt '6 qseqid qlen qstart qend sseqid slen sstart send length qcovs pident evalue'
+fi
+
+
+printf "\n#  7). Determine status of RefMatch and AltMatch and Assign fixed IDs to them\n"
+if test -f "$CUTADAPT_UNI"; then
+    python $SCRIPTS_DIR/step05_parse_madc_allele81bp_blastn_v1.py $ALLELE_DB_DIR/$MATCHCNT_LUT $ALLELE_DB_DIR/$ALLELE_DB $TMP_RENAME_UPDATED $BLAST_DBBLAST
+    MADC_CLEANED=${REPORT_SNPID%????}'_rename_updatedSeq.csv'
+else
+    python $SCRIPTS_DIR/step05_parse_madc_allele81bp_blastn_v1.py $ALLELE_DB_DIR/$MATCHCNT_LUT $ALLELE_DB_DIR/$ALLELE_DB $TMP_RENAME $BLAST_DBBLAST
+    MADC_CLEANED=${REPORT_SNPID%????}'_rename.csv'
+fi
+
+
+
+printf "\n#  8). Check if there is a new version DB\n"
+# If there are new alleles added to the db, a new version of db will be generated
+# Otherwise, no new db
+VER=$(echo $ALLELE_DB | grep -o 'v[0-9]\{3\}' | cut -c2- | sed 's/^0*//')
+NEW_VER=$(printf '%03d' $(($VER+1)))
+ALLELE_DB_NEW=${ALLELE_DB%??????}$NEW_VER'.fa'
+if test -f "$ALLELE_DB_DIR/$ALLELE_DB_NEW"; then
+    printf "  # New version of db created after adding noval alleles.\n"
+    printf "  $ALLELE_DB_DIR/$ALLELE_DB_NEW\n"
+    makeblastdb -in $ALLELE_DB_DIR/$ALLELE_DB_NEW -dbtype nucl
+
+    printf "\n#  9). Check if there are duplicates."
+    printf "  # If there are duplicates, retain only one allele of the duplicated ones\n"
+    #cat $ALLELE_DB_NEW_BLAST | awk '$1!=$5 && $10==100 && $11==100.0' | more
+    python $SCRIPTS_DIR/step06_check_db_allele_uniqueness_v1.py $ALLELE_DB_DIR/$ALLELE_DB_NEW
+
+    printf "\n#  10). If there are duplicates in the new version DB, update MADC after removing duplicated alleles in db\n"
+    DUP=$ALLELE_DB_DIR/$ALLELE_DB_NEW'.dup.csv'
+    if test -f "$DUP"; then
+        VER=$(echo $ALLELE_DB | grep -o 'v[0-9]\{3\}' | cut -c2- | sed 's/^0*//')
+        NEW_VER_RMDUP=$(printf '%03d' $(($VER+2)))
+        ALLELE_DB_NEW_RMDUP=${ALLELE_DB%??????}$NEW_VER_RMDUP'.fa'
+        makeblastdb -in $ALLELE_DB_DIR/$ALLELE_DB_NEW_RMDUP -dbtype nucl
+        printf "  # There are duplicate alleles in db. Check if these duplicate alleles are in MADC file.\n"
+        python $SCRIPTS_DIR/step06_update_MADC_with_allele_uniqueness_v1.py $DUP $MADC_CLEANED
+        printf "\n#  11). Add version of the script to the MADC with fixed allele IDs\n"
+        if test -f "$CUTADAPT_UNI"; then
+            MADC_CLEANED_RMDUP=${REPORT_SNPID%????}'_rename_updatedSeq_rmDup.csv'
+            MADC_CLEANED_RMDUP_VER=${REPORT_SNPID%????}'_rename_updatedSeq_rmDup_'$CODE_VER'.csv'
+        else
+            MADC_CLEANED_RMDUP=${REPORT_SNPID%????}'_rename_rmDup.csv'
+            MADC_CLEANED_RMDUP_VER=${REPORT_SNPID%????}'_rename_rmDup_'$CODE_VER'.csv'
+        fi
+        awk -v val="$CODE_VER" 'NR==1{print "Code_version," $0} NR>1{print val "," $0}' $MADC_CLEANED_RMDUP > $MADC_CLEANED_RMDUP_VER
+    else
+        printf "  # No duplicate alleles found in microhap db\n"
+        echo $ALLELE_DB_DIR/$ALLELE_DB_NEW
+        printf "\n#  11). Add version of the script to the MADC with fixed allele IDs\n"
+        if test -f "$CUTADAPT_UNI"; then
+            MADC_CLEANED_VER=${REPORT_SNPID%????}'_rename_updatedSeq_'$CODE_VER'.csv'
+        else
+            MADC_CLEANED_VER=${REPORT_SNPID%????}'_rename_'$CODE_VER'.csv'
+        fi
+        awk -v val="$CODE_VER" 'NR==1{print "Code_version," $0} NR>1{print val "," $0}' $MADC_CLEANED > $MADC_CLEANED_VER
+        printf "  # Version added to as the first column of the output file.\n"
+    fi 
+else
+    printf "  # No new alleles found, therefore, no new db generated.\n"
+    mv $PROCESS_README $NO_NEW_ALLELE_README
+fi
+
+printf "\n###### Complete! #######\n"
diff --git a/README.md b/README.md
@@ -8,6 +8,64 @@ A central metadata repository of marker-panel definitions for downstream workflo
 - **Workflow Role:** During MADC-to-VCF conversion, these files tell BIGapp/BIGr exactly which sequences to extract as reverse complements.  
 - **Format:** Plain-text lists—one marker ID per line (optionally namespaced by panel or species).
 
+## `_lut.csv` Files
+
+- **Purpose:** Provide mappable CloneID with the botloci files in case MADC CloneID doesn't match; provide REF and ALT bases for speedup madc2vcf_targets; provide indels position and length. 
+- **Workflow Role:** Used by all `madc2vcf`. If absent, results are limited depending on the format of the input.  
+- **Format:** CSV containing columns: Panel_markerID, BI_markerID, Chr, Pos, Ref, Alt, Type, Indel_pos, Note
+
+## `_v001.fa` Files
+
+- **Purpose:** first version of the microhaplotype DB. Serves to recover possible missing Ref and Alt tags 
+- **Workflow Role:** Used by all `madc2vcf`. If absent, results are limited depending on the format of the input.  
+- **Format:** fasta file containing header with CloneID followed by the AlleleSequence
+
+## MADC test files:
+
+### Alfalfa
+
+* `alfalfa_IUPAC.csv`: 
+    * 300 CloneIDs and 30 samples simulated
+    * AlleleSequences have IUPAC codes
+* `alfalfa_lowercase.csv`: 
+    * 300 CloneIDs and 30 samples simulated
+    * AlleleSequences have mix of lower and upper case
+    * has 3 Ref and 1 Alt tag missing
+* `alfalfa_madc_raw.csv`: 
+    * 300 CloneIDs and 30 samples simulated
+    * has the raw MADC header
+    * AlleleID is not fixed (doesn't have _0001 or _0002)
+* `alfalfa_madc_wrongID.csv`: 
+    * 300 CloneIDs and 30 samples simulated
+    * CloneID doesn't match botloci (CloneID = chr01_[position]; botloci = chr01.1_[position])
+* `alfalfa_madc.csv`: 
+    * 300 CloneIDs and 30 samples simulated
+    * all clean
+* `alfalfa_marker_info_ChromPos.csv`: 
+    * version of _lut.csv file without the REF and ALT
+
+### Potato
+
+* `potato_indel_IUPAC.csv`: 
+    * 300 CloneIDs and 30 samples simulated
+    * contains indels and AlleleSequences have IUPAC codes
+* `potato_indel_lowercase.csv`: 
+    * 300 CloneIDs and 30 samples simulated
+    * contains indels 
+    * AlleleSequences have mix of lower and upper case 
+* `potato_indel_madc.csv`: 
+    * 300 CloneIDs and 30 samples simulated 
+    * contains indels
+* `potato_marker_info_chrompos.csv`: 
+    * version of _lut.csv file without the REF and ALT
+* `potato_more_indels_madc_ChromPosFALSE.csv`: 
+    * 300 CloneIDs and 30 samples simulated 
+    * contains indels 
+    * not all CloneID have the Chr_Pos format
+
+### Simulation script
+* `generate_simu_madc.R`: script used to generate the simulated MADC files
+
 ---
 
-For full usage details, file format specifications, and version history, see the [BIGapp ](https://github.com/Breeding-Insight/BIGapp) and [BIGr](https://github.com/Breeding-Insight/BIGr) functions `madc2vcf_all` and `madc2vcf_targets` documentations.
+For full usage details, file format specifications, and version history, see the [BIGapp ](https://github.com/Breeding-Insight/BIGapp) and [BIGr](https://github.com/Breeding-Insight/BIGr) functions `madc2vcf_all`, `madc2vcf_targets` and `madc2vcf_multi` documentations.