bioinfo

BWA-MEM Workflow

worker1 is responsible to accomplish the alignments

1
worker1 -> mem_align1_core -> mem_chain

worker2 is responsible to transfer alignments to BAM/SAM format

1
worker2 -> mem_reg2sam -> mem_aln2sam

mem_align1_core

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf)

/** Actually are mem_align1
* Find the aligned regions for one query sequence
*
* Note that this routine does not generate CIGAR. CIGAR should be
* generated later by mem_reg2aln() below.
*
*
* @param bwt FM-index of the reference sequence
* @param bns Information of the reference ???
* @param pac 2-bit encoded reference ???
* @param l_seq length of query sequence
* @param seq query sequence
*
* @return list of aligned regions.
*/

Steps:

  1. convert to 2-bit encoding if we have not done so ???
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
    where in bntseq.c
    unsigned char nst_nt4_table[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
    };
  • Chaining bwamem.c/mem_chain

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    typedef struct {
    int n, m, first, rid;
    uint32_t w:29, kept:2, is_alt:1;
    float frac_rep;
    int64_t pos;
    mem_seed_t *seeds;
    } mem_chain_t;

    typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v;

    mem_chain_v (const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq, void *buf)
  • Filtering chain bwamem.c/mem_chain_flt

  • Filtering seeds in chain bwamem.c/mem_flt_chained_seeds

BWA-MEM Data Structure

bwamem.h/mem_opt_t BWA-MEM’s input options

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
typedef struct {
int a, b; // match score and mismatch penalty
int o_del, e_del; // ??
int o_ins, e_ins; // ??
int pen_unpaired; // ?? phred-scaled penalty for unpaired reads
int pen_clip5,pen_clip3;// ?? clipping penalty. This score is not deducted from the DP score.
int w; // ?? band width
int zdrop; // ? Z-dropoff

uint64_t max_mem_intv; // ??

int T; // ?? output score threshold; only affecting output
int flag; // ?? see MEM_F_* macros
int min_seed_len; // ? minimum seed length
int min_chain_weight; // ??
int max_chain_extend; // ?
float split_factor; // ? split into a seed if MEM is longer than min_seed_len*split_factor
int split_width; // ? split into a seed if its occurence is smaller than this value
int max_occ; // ? skip a seed if its occurence is larger than this value
int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed
int n_threads; // number of threads
int chunk_size; // ? process chunk_size-bp sequences in a batch
float mask_level; // ? regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
float drop_ratio; // ? drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain
float XA_drop_ratio; // ? when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag
float mask_level_redun; // ??
float mapQ_coef_len; // ??
int mapQ_coef_fac; // ??
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all
int8_t mat[25]; // ? scoring matrix; mat[0] == 0 if unset
} mem_opt_t;