msmc workflow

Generating consensus sequences for each sample each chromosme

1
2
3
4
5
6
7
8
9
10
11
DATAPATH="/users/lbuggiotti/WGresRussia/umd_3_1"
DATA="/users/lbuggiotti/WGresRussia/raw_data/BQSR_recalBAM"
chr="./bamlist.txt"

B=$(($SGE_TASK_ID-1))
IDX=$(($B % 40))
input=$(tail -n+${IDX} ${chr} | head -n1)

for i in Chr{1..29} ChrX; do /users/hzhou/tools/samtools/bin/samtools mpileup -q 20 -Q 20 -C 50 -u -r ${i} -f ${DATAPATH}/umd_3_1_reference.fa ${input} |
/users/hzhou/tools/bcftools-1.3.1/bcftools call -c -V indels |
/users/hzhou/tools/python/bin/python3.6 /users/hzhou/tools/msmc-tools/bamCaller.py 11 ${input##*/}.${i}_mask.bed.gz| gzip -c > ${input##*/}.${i}.vcf.gz ;done

Combining samples for each chromosme(high coverage)

1
for i in Chr{1..29} ChrX; do /users/hzhou/miniconda3/bin/python3.6 /users/hzhou/tools/msmc-tools/generate_multihetsep.py --chr ChrX --mask=./A7.${i}_mask.bed.gz --mask=./A25.${i}_mask.bed.gz --mask=./A1.${i}_mask.bed.gz --mask=./A5.${i}_mask.bed.gz --mask=./A17.${i}_mask.bed.gz --mask=./A13.${i}_mask.bed.gz --mask=./A3.${i}_mask.bed.gz --mask=./A15.${i}_mask.bed.gz --mask=./A8.${i}_mask.bed.gz --mask=./A16.${i}_mask.bed.gz --mask=./A41.${i}_mask.bed.gz --mask=./A42.${i}_mask.bed.gz --mask=./A38.${i}_mask.bed.gz --mask=./A26.${i}_mask.bed.gz --mask=./A43.${i}_mask.bed.gz --mask=./A27.${i}_mask.bed.gz --mask=./A37.${i}_mask.bed.gz --mask=./A40.${i}_mask.bed.gz --mask=./A33.${i}_mask.bed.gz --mask=./A39.${i}_mask.bed.gz --mask=/users/hzhou/tools/seqbility-20091110/umd_3_1_35_chr${i}.mask.bed.gz A7.${i}.vcf.gz A25.${i}.vcf.gz A1.${i}.vcf.gz A5.${i}.vcf.gz A17.${i}.vcf.gz A13.${i}.vcf.gz A3.${i}.vcf.gz A15.${i}.vcf.gz A8.${i}.vcf.gz A16.${i}.vcf.gz A41.${i}.vcf.gz A42.${i}.vcf.gz A38.${i}.vcf.gz A26.${i}.vcf.gz A43.${i}.vcf.gz A27.${i}.vcf.gz A37.${i}.vcf.gz A40.${i}.vcf.gz A33.${i}.vcf.gz A39.${i}.vcf.gz > ./Y10K10.high.msmcinput.txt/${i}.YandK.msmcinput.txt ;done
  • mask files are in /users/hzhou/tools/seqbility-20091110/

the following script to build bos taurus mask files

1
2
3
4
5
6
7
8
9
module load apps/bwa/0.7.10/gcc-4.4.7
/users/hzhou/tools/seqbility-20091110/splitfa /users/lbuggiotti/WGresRussia/umd_3_1/umd_3_1_reference.fa 35 | split -l 20000000
ls ./x* |while read id ;do bwa aln -t 10 -R 1000000 -O 3 -E 3 /users/lbuggiotti/WGresRussia/umd_3_1/umd_3_1_reference.fa ${id} > ${id}.sai ;done
ls ./x?? |while read id ;do bwa samse -f ${id}.sam /users/lbuggiotti/WGresRussia/umd_3_1/umd_3_1_reference.fa ${id}.sai ${id} && gzip ${id}.sam ;done
wait
gzip -dc x*.sam.gz | ./gen_raw_mask.pl > rawMask_umd_3_1_35.fa
wait
./gen_mask -l 35 -r 0.5 rawMask_umd_3_1_35.fa > mask_35_50.fa
/users/hzhou/tools/python2.7/bin/python2.7 /users/hzhou/tools/msmc-tools/makeMappabilityMask.py

Estimating the effective population size

module load apps/msmc

for K

1
msmc2 -t 5 -o k.h.10h_msmc_output -I 0,1,2,3,4,5,6,7,8,9 Chr1.YandK.msmcinput.txt Chr2.YandK.msmcinput.txt Chr3.YandK.msmcinput.txt Chr4.YandK.msmcinput.txt Chr5.YandK.msmcinput.txt Chr6.YandK.msmcinput.txt Chr7.YandK.msmcinput.txt Chr8.YandK.msmcinput.txt Chr9.YandK.msmcinput.txt Chr10.YandK.msmcinput.txt Chr11.YandK.msmcinput.txt Chr12.YandK.msmcinput.txt Chr13.YandK.msmcinput.txt Chr14.YandK.msmcinput.txt Chr15.YandK.msmcinput.txt Chr16.YandK.msmcinput.txt Chr17.YandK.msmcinput.txt Chr18.YandK.msmcinput.txt Chr19.YandK.msmcinput.txt Chr20.YandK.msmcinput.txt Chr21.YandK.msmcinput.txt Chr22.YandK.msmcinput.txt Chr23.YandK.msmcinput.txt Chr24.YandK.msmcinput.txt Chr25.YandK.msmcinput.txt Chr26.YandK.msmcinput.txt Chr27.YandK.msmcinput.txt Chr28.YandK.msmcinput.txt Chr29.YandK.msmcinput.txt ChrX.YandK.msmcinput.txt

for Y

1
msmc2 -t 5 -o y.h.10h_msmc_output -I 20-20,20-21,20-22,20-23,20-24,20-25,20-26,20-27,20-28,20-29,21-21,21-22,21-23,21-24,21-25,21-26,21-27,21-28,21-29,22-22,22-23,22-24,22-25,22-26,22-27,22-28,22-29,23-23,23-24,23-25,23-26,23-27,23-28,23-29,24-24,24-25,24-26,24-27,24-28,24-29,25-25,25-26,25-27,25-28,25-29,26-26,26-27,26-28,26-29,27-27,27-28,27-29,28-28,28-29 Chr1.YandK.msmcinput.txt Chr2.YandK.msmcinput.txt Chr3.YandK.msmcinput.txt Chr4.YandK.msmcinput.txt Chr5.YandK.msmcinput.txt Chr6.YandK.msmcinput.txt Chr7.YandK.msmcinput.txt Chr8.YandK.msmcinput.txt Chr9.YandK.msmcinput.txt Chr10.YandK.msmcinput.txt Chr11.YandK.msmcinput.txt Chr12.YandK.msmcinput.txt Chr13.YandK.msmcinput.txt Chr14.YandK.msmcinput.txt Chr15.YandK.msmcinput.txt Chr16.YandK.msmcinput.txt Chr17.YandK.msmcinput.txt Chr18.YandK.msmcinput.txt Chr19.YandK.msmcinput.txt Chr20.YandK.msmcinput.txt Chr21.YandK.msmcinput.txt Chr22.YandK.msmcinput.txt Chr23.YandK.msmcinput.txt Chr24.YandK.msmcinput.txt Chr25.YandK.msmcinput.txt Chr26.YandK.msmcinput.txt Chr27.YandK.msmcinput.txt Chr28.YandK.msmcinput.txt Chr29.YandK.msmcinput.txt ChrX.YandK.msmcinput.txt

plot using R script

1
2
3
4
5
6
7
8
9
10
setwd("D:/Data_analysis/evolution/MSMC/yh10.29chr/")
mu <- 1.25e-8
gen <- 5
yDat<-read.table("y.h.10h_msmc_output.final.txt", header=TRUE)
kDat<-read.table("k.h.10h_msmc_output.final.txt", header=TRUE)
plot(kDat$left_time_boundary/mu*gen, (1/kDat$lambda)/mu, log="x",ylim=c(0,100000),
type="n", xlab="Years ago", ylab="effective population size")
lines(yDat$left_time_boundary/mu*gen, (1/yDat$lambda)/mu, type="s", col="red")
lines(kDat$left_time_boundary/mu*gen, (1/kDat$lambda)/mu, type="s", col="blue")
legend("topright",legend=c("Yakut", "Kholmogory"), col=c("red", "blue"), lty=c(1,1))
  • set the u and g here

reference:

https://github.com/stschiff/msmc-tools/blob/master/msmc-tutorial/guide.md