#!/usr/bin/perl -w #Design: Feng Cheng, Chengcheng Cai #Program: Feng Cheng #Bug report: chengfeng@caas.cn use strict; use warnings; use threads; use Getopt::Long; =head1 program discription Call Variations From Population Samples With High Accuracy Prerequiried tools for running: BWA, Samtools =head1 Command-line Option --inDir [required] The directory where the population resequencing reads are stored. --outDir [required] The directory for store the filtered reads and other temp file. --qual [required] File to provide the illumina reads quality version 1.8 (all >=1.8), or 1.3 (all <1.8). --ref [required] Fasta format of the reference genome. --outStat [optional] File to store the statistics during filtering raw reads. --Gsize [optional] The size of the genome in units of Gb. Automatic calculated from the reference genome as default. --folds [optional] How many folds for each sample use to do the calling. Use all data as default [0]. --build [optional] Build index [1] for the reference, or don't build it [0] incase it has been built previously. [1]. --pop [optional] The file of polymorphic loci for variant calling. If not exist, will perform pooled mapping as default [1] --sam [optional] File to store the samples used in the variant calling. --pos [optional] File to store the genomic position of variants called. --geno [optional] File to store the genotypes of variants called. --help =head1 Usage perl popSeq2Geno.pl -inDir example_reads/ -outDir example_reads_temp/ -qual example_quality -ref example_genome.fa -folds 1 -build 1 -pop 1 =cut ##--Initialize input data--## my ($dir,$outDir,$qual,$stat,$gsize,$fold,$refG,$bref,$pop,$sam,$pos,$geno,$help); GetOptions( "inDir=s"=>\$dir, "outDir=s"=>\$outDir, "qual=s"=>\$qual, "outStat=s"=>\$stat, "Gsize=s"=>\$gsize, "folds=s"=>\$fold, "ref=s"=>\$refG, "build=s"=>\$bref, "pop=s"=>\$pop, "sam=s"=>\$sam, "pos=s"=>\$pos, "geno=s"=>\$geno, "help"=>\$help, ); die `pod2text $0` if($help); die `pod2text $0` if(!defined $dir); die `pod2text $0` if(!defined $outDir); die `pod2text $0` if(!defined $qual); die `pod2text $0` if(!defined $refG); if(!defined $stat) { $stat = "readsFilter.statistics"; } if(!defined $gsize) { $gsize = &readGsize($refG); } if(!defined $fold) { $fold = 0; } if(!defined $bref) { $bref = 1; } if(!defined $pop) { $pop = 1; } if(!defined $sam) { $sam = "samples"; } if(!defined $pos) { $pos = "positions.loci"; } if(!defined $geno) { $geno = "genotypes.loci"; } if($pop !~ /^1$/) { $fold = 0; } #--Launch the running process----------------------- &process_main($dir,$outDir,$qual,$stat,$gsize,$fold,$refG,$bref,$pop,$sam,$pos,$geno); sub process_main { my ($dir,$outDir,$qual,$stat,$gsize,$fold,$ref,$bref,$pop,$sam,$pos,$geno) = @_; my (%file,%rfile2rec); &readFile(\%file,\%rfile2rec,$dir); if(!(-e $outDir)) { system("mkdir $outDir"); } my $fOutDir = $outDir."reads_ft/"; # &qualityFilter($qual,\%file,\%rfile2rec,$dir,$fOutDir,$stat); print "\nAll filtering process run successfully...\n\n"; my %sam2fold; &readStat($stat,\%sam2fold,$gsize); my $pfOutDir = $outDir."reads_sel/"; &pickOutReads($fOutDir,$pfOutDir,$fold,\%sam2fold); if($bref == 1) { system("bwa index $ref"); #--format reference-- system("samtools faidx $ref"); } my $mapDir = $outDir."mapped/"; my $plist; if($pop =~ /^1$/) { $plist = "populationPolymorphicLoci.posAlleleT"; &bMapPop($pfOutDir,$mapDir,$ref,$plist,$fold,\%sam2fold); } else { $plist = $pop; } my $mapDir2 = $mapDir; if($fold == 0) { &bMapIndi($pfOutDir,$mapDir2,$ref,$plist,$pos,$geno,$sam,$fold,\%sam2fold,$pop); } else { $mapDir2 = $outDir."mapped2/"; &bMapIndi($fOutDir,$mapDir2,$ref,$plist,$pos,$geno,$sam,$fold,\%sam2fold,$pop); } } sub readStat { my ($stat,$sam2fold,$gsize) = @_; open(my $FR,$stat); while($_=<$FR>) { chomp; my @temp = split(/\t/); $temp[0] =~ /\/([^\/]+)_1\.fq[s]*\.gz/; my $sam = $1; my $rfold = $temp[7]/1000000000/$gsize; $sam2fold->{$sam} = $rfold; } close($FR); } sub pickOutReads { my ($fOutDir,$pfOutDir,$fold,$sam2fold) = @_; if(-e $pfOutDir) { system("rm -rf $pfOutDir"); } system("mkdir $pfOutDir"); if($fold!=0) { foreach my $sam (sort {$a cmp $b} keys %$sam2fold) { my $r1 = $sam."_1.fq.ft.gz"; my $r2 = $sam."_2.fq.ft.gz"; if($sam2fold->{$sam}>$fold) { my $nfold = $fold/$sam2fold->{$sam}; my $in1 = $fOutDir.$r1; my $in2 = $fOutDir.$r2; open(my $FR1,"gzip -dc $in1 |"); open(my $FR2,"gzip -dc $in2 |"); my $out1 = $pfOutDir.$r1; my $out2 = $pfOutDir.$r2; open(my $FW1,"| gzip >$out1"); open(my $FW2,"| gzip >$out2"); my (@L1,@L2); while($_=<$FR1>) { $L1[0] = $_; $L1[1] = <$FR1>; $L1[2] = <$FR1>; $L1[3] = <$FR1>; $L2[0] = <$FR2>; $L2[1] = <$FR2>; $L2[2] = <$FR2>; $L2[3] = <$FR2>; if(rand(1)<$nfold) { print $FW1 $L1[0]; print $FW1 $L1[1]; print $FW1 $L1[2]; print $FW1 $L1[3]; print $FW2 $L2[0]; print $FW2 $L2[1]; print $FW2 $L2[2]; print $FW2 $L2[3]; } } close($FR1); close($FR2); close($FW1); close($FW2); } else { my $ls1 = $fOutDir.$r1; my $ls2 = $fOutDir.$r2; $ls1 =~s/^[^\/]+\///; $ls2 =~s/^[^\/]+\///; system("ln -s ../$ls1 $pfOutDir"); system("ln -s ../$ls2 $pfOutDir"); } } } else { foreach my $sam (sort {$a cmp $b} keys %$sam2fold) { my $r1 = $sam."_1.fq.ft.gz"; my $r2 = $sam."_2.fq.ft.gz"; my $ls1 = $fOutDir.$r1; my $ls2 = $fOutDir.$r2; $ls1 =~s/^[^\/]+\///; $ls2 =~s/^[^\/]+\///; system("ln -s ../$ls1 $pfOutDir"); system("ln -s ../$ls2 $pfOutDir"); } } } sub readGsize { my ($in) = @_; my %sca2seq; &readFasta($in,\%sca2seq); my $gsize = 0; foreach my $sca (keys %sca2seq) { $gsize += length($sca2seq{$sca}); } $gsize /= 1000000000; return $gsize; } sub readFasta { my ($in,$id2seq) = @_; open(my $SFR,$in); my $id; while($_=<$SFR>) { if(/^>([^\s^\n]+)\s*\n*/) { $id = $1; $id2seq->{$id} = ""; } else { chomp; $id2seq->{$id} .= $_; } } close($SFR); } sub bMapPop { my ($dir,$outDir,$ref,$plist,$fold,$sam2fold) = @_; my (%file,%file2rec); ##print "$dir\n"; &readFile(\%file,\%file2rec,$dir); &bwaMapPop(\%file,\%file2rec,$dir,$outDir,$ref,$plist,$fold,$sam2fold); print "\nPopulation mapping process run successfully...\n\n"; &mergeBam($outDir,$plist,$ref,$fold,$sam2fold); } sub bwaMapPop { my ($file,$file2rec,$dir,$outDir,$ref,$plist,$fold,$sam2fold) = @_; if(-e $outDir) { system("rm -rf $outDir"); } system("mkdir $outDir"); my $sample; my $posF = $plist; #--it depends---- foreach $sample (sort {$a cmp $b} keys %$file) { print "\nStart processing $sample...\n"; my $in1 = $sample."_1.fq.ft.gz"; my $in2 = $sample."_2.fq.ft.gz"; #print "$in1\t$in2\n"; if(exists($file2rec->{$in1}) && exists($file2rec->{$in2})) { my $outSam = $outDir.$sample.".cf.sam"; &bmap($dir,$outDir,$in1,$in2,$outSam,$sample,$ref); } } } sub mergeBam { my ($outDir,$plist,$ref,$fold,$sam2fold) = @_; my $outDirm = $outDir."*.bam"; my $samID = "merge_mapped"; system("samtools merge $samID.bam $outDirm"); my $t1 = $samID.".bam.sorted"; system("samtools sort $samID.bam $t1"); system("rm $samID.bam"); my $t2 = $t1.".bam"; system("samtools index $t2"); my @sam = keys %$sam2fold; my $nsam = @sam; my $bdep; if($fold==0) { foreach my $sam (keys %$sam2fold) { $bdep += $sam2fold->{$sam}; } } else { foreach my $sam (keys %$sam2fold) { if($sam2fold->{$sam}<$fold) { $bdep += $sam2fold->{$sam}; } else { $bdep += $fold; } } } my ($minDep,$maxDep); $minDep = $bdep * 0.5; if($minDep < 5) { $minDep = 5; } $maxDep = $bdep * 2; system("samtools mpileup -q 30 -Q 30 -C50 -d $maxDep -L $maxDep -ugf $ref $t2 | bcftools view -p 0.9999 -vcg - > $samID.cons.vcf"); my $vfile = $samID.".cons.vcf"; ##print "$vfile\n"; &screenGenotypeBwa($vfile,$maxDep); system("awk '\(\$5!~/,/ && \$5~/\\w/ && \$6+\$7>=$minDep\)' $samID.cons.vcf.genotype > $plist"); system("rm $t2 $t2.bai"); system("rm $vfile $samID.cons.vcf.genotype"); } sub screenGenotypeBwa { my ($in,$dep) = @_; my $out = $in.".genotype"; open(my $FR,$in); open(my $FW,">$out"); while($_=<$FR>) { chomp; my @temp = split(/\t/); #if($temp[4]=~/\w/ && !($temp[4]=~/,/)) { #if(/^[^#]/ && $temp[4]=~/\w/) { if(/^[^#]/) { if($temp[7] =~ /;DP4=(\d+),(\d+),(\d+),(\d+);/) { my ($d1,$d2) = ($1+$2,$3+$4); my ($ref,$a1,$a2) = ($temp[3],$temp[3],$temp[4]); if($d1>=3 && $d1<=$dep && $d2>=3 && $d2<=$dep) { #if($a2 eq $ref) { # $a2 = $a1; # $a1 = $ref; #} #elsif($a1 ne $ref && $a2 ne $ref) { # $a1 = "N"; # $a2 = "N"; #} } elsif($d1>=3 && $d1<=$dep && $d2==0) { $a2 = $a1; $d2 = $d1; } elsif($d2>=3 && $d2<=$dep && $d1==0) { $a1 = $a2; $d1 = $d2; } else { $a1 = "N"; $a2 = "N"; } #if(!($a2=~/^\w$/)) { # $a1 = "N"; # $a2 = "N"; #} if($a1 ne "N" && $a2 ne "N" && $a1 ne $a2) { print $FW "$temp[0]\t$temp[1]\t$ref\t$a1\t$a2\t$d1\t$d2\n"; } } } } close($FR); close($FW); } sub bMapIndi { my ($dir,$outDir,$ref,$plist,$posiF,$genoF,$samF,$fold,$sam2fold,$pop) = @_; if($pop !~ /^1$/) { if(-e $outDir) { system("rm -rf $outDir"); } system("mkdir $outDir"); } my (%file,%file2rec); &readFile(\%file,\%file2rec,$dir); &bwaMap(\%file,\%file2rec,$dir,$outDir,$ref,$plist,$posiF,$genoF,$samF,$fold,$sam2fold,$pop); print "\nAll individual calling process run successfully...\n\n\n"; } sub bwaMap { my ($file,$file2rec,$dir,$outDir,$ref,$plist,$posiF,$genoF,$samF,$fold,$sam2fold,$pop) = @_; if(!(-e $outDir)) { system("mkdir $outDir"); } my ($sample,%i2geno); my $posF = $plist; #--it depends---- my (%i2pos,%i2alle); &readPos($posF,\%i2pos,\%i2alle); foreach $sample (sort {$a cmp $b} keys %$file) { print "\nStart processing $sample...\n"; my $in1 = $sample."_1.fq.ft.gz"; my $in2 = $sample."_2.fq.ft.gz"; if(exists($file2rec->{$in1}) && exists($file2rec->{$in2})) { my $outSam = $outDir.$sample.".cf.sam"; if($fold!=0 || $pop!~/^1$/) { &bmap($dir,$outDir,$in1,$in2,$outSam,$sample,$ref); } &callVar($outSam,$ref,\%i2geno,$posF,\%i2pos,\%i2alle); } } ##system("perl extractGP.pl $outDir $posF $posiF $genoF $samF"); &extractGP($outDir,$posF,$posiF,$genoF,$samF,$fold,$sam2fold); ##print "$outDir\n"; } sub extractGP { my ($outDir,$posF,$posiF,$genoF,$samF,$fold,$sam2fold) = @_; my (%i2pos,%i2alle); &readPos($posF,\%i2pos,\%i2alle); my %i2geno; &extractGeno($outDir,\%i2pos,\%i2alle,\%i2geno,$samF,$sam2fold); &output(\%i2pos,\%i2alle,\%i2geno,$posiF,$genoF); } sub output { my ($i2pos,$i2alle,$i2geno,$posiF,$genoF) = @_; my ($out1,$out2) = ($posiF,$genoF); #--it depends-- open(my $FW1,">$out1"); open(my $FW2,">$out2"); my $count = 0; foreach my $i (sort {$a <=> $b} keys %$i2pos) { my $alle = $i2alle->{$i}; my @ta = split(/:/,$alle); my $tg = $i2geno->{$i}; $tg =~ s/N//g; if($tg=~/\w/) { $tg =~ s/$ta[0]//g; $tg =~ s/$ta[1]//g; if(!($tg=~/\w/)) { my $pos = $i2pos->{$i}; $pos =~ s/-/\t/; print $FW1 "$pos\t$ta[0]\t$ta[1]\n"; print $FW2 "$i2geno->{$i}\n"; } } else { $count += 1; } } #print "$count\n"; close($FW1); close($FW2); } sub extractGeno { my ($outDir,$i2pos,$i2alle,$i2geno,$samF,$sam2fold) = @_; my $gfile; #.list; open(my $FW,">$samF"); opendir(DIR,$outDir) || die "Error in opening dir $outDir\n"; my $filename; my $mdep; while(($filename = readdir(DIR))){ if($filename=~/^(.+)\.cf\.vcf\.list/) { my $sam = $1; print $FW "$sam\n"; my $rfold = $sam2fold->{$sam}; if($rfold>3) { $mdep = 0; #**to be revised*************** } else { $mdep = 0; } my %pos2rec; &readData($outDir.$filename,\%pos2rec); foreach my $i (sort {$a <=> $b} keys %$i2pos) { my $pos = $i2pos->{$i}; my @sa; ($sa[0],$sa[1]) = ("N","N"); if(!exists($pos2rec{$pos})) { if(!exists($i2geno->{$i})) { $i2geno->{$i} = "NN"; } else { $i2geno->{$i} .= "NN"; } } else { my $data = $pos2rec{$pos}; my @temp = split(/\t/,$data); if($temp[7]=~/;DP4=(\d+),(\d+),(\d+),(\d+);/) { my ($d1,$d2) = ($1+$2,$3+$4); if($d1>$mdep && $d2>$mdep) { if($temp[3]=~/^\w$/ && $temp[4]=~/^\w$/) { $sa[0] = $temp[3]; $sa[1] = $temp[4]; } } elsif($d1>$mdep) { if($temp[3]=~/^\w$/ && $temp[4]=~/^.{1}$/) { $sa[0] = $temp[3]; $sa[1] = $temp[3]; } } elsif($d2>$mdep) { if($temp[3]=~/^\w$/ && $temp[4]=~/^\w$/) { $sa[0] = $temp[4]; $sa[1] = $temp[4]; } } } if(!exists($i2geno->{$i})) { $i2geno->{$i} = "$sa[0]$sa[1]"; } else { $i2geno->{$i} .= "$sa[0]$sa[1]"; } } } } } close($FW); closedir(DIR); } sub readData { my ($gfile,$pos2rec) = @_; open(my $FR,$gfile); my $index = 1; while($_=<$FR>) { if(!(/^#/) && !(/INDEL/)) { chomp; my @temp = split(/\t/); if($temp[3]=~/^\w$/ && $temp[4]=~/^[\.\w]$/) { my $pos = $temp[0]."-".$temp[1]; $pos2rec->{$pos} = $_; } } } close($FR); } sub callVar { my ($outSam,$ref,$i2geno,$posF,$i2pos,$i2alle) = @_; $outSam =~ /^(.+)\.sam$/; my $sample = $1; system("rm $outSam"); my $t1 = $sample.".bam"; my $t2 = $t1.".sorted"; system("samtools sort $t1 $t2"); system("rm $t1"); my $t3 = $t1.".sorted.bam"; system("samtools index $t3"); my $t4 = $sample.".vcf"; system("samtools mpileup -q 20 -Q 30 -C50 -l $posF -ugf $ref $t3 | bcftools view -p 0.9 -cg - > $t4.list"); system("rm $t3 $t3.bai"); print "$sample calling is clear now...\n"; } sub readPos { my ($posF,$i2pos,$i2alle) = @_; open(my $FR,$posF); my $index = 1; while($_=<$FR>) { if(!(/^#/)) { chomp; my @temp = split(/\t/); if($temp[3]=~/^\w$/ && $temp[4]=~/^\w$/) { $i2pos->{$index} = $temp[0]."-".$temp[1]; $i2alle->{$index} = $temp[3].":".$temp[4]; $index += 1; } } } close($FR); } sub readMdep { my ($in,$sample2dep) = @_; open(my $FR,$in); while($_=<$FR>) { chomp; my @temp = split(/\t/); $sample2dep->{$temp[0]} = $temp[1]; } close($FR); } sub bmap { my ($dir,$outDir,$in1,$in2,$out3,$sample,$ref) = @_; my $out1 = $outDir.$in1.".sai"; my $out2 = $outDir.$in2.".sai"; $in1 = $dir.$in1; $in2 = $dir.$in2; system("bwa mem -t 8 $ref $in1 $in2 > $out3"); $out3 =~ /^(.+)\.sam$/; my $sam = $1; my $t1 = $sam.".bam"; system("samtools view -bS $out3 > $t1"); print "$sam mapping is clear now...\n"; } #sub readFile { # # my ($file,$file2rec,$dir) = @_; # # opendir(DIR,$dir) || die "Error in opening dir $dir\n"; # my $filename; # # while(($filename = readdir(DIR))){ # if($filename=~/^(.+)_\d\.fq\.ft\.gz/) { #it depends--- _hb_1.fq.ft _hb_2.fq.ft # $file->{$1} = "Y"; # $file2rec->{$filename} = "Y"; # } # } # # closedir(DIR); # #} sub qualityFilter { my ($qual,$file,$file2rec,$dir,$outDir,$stat) = @_; if(!(-e $outDir)) { system("mkdir $outDir"); } if(-e $stat) { system("rm $stat"); } my $sample; foreach $sample (sort {$a cmp $b} keys %$file) { #print "$sample\t"; my $in1 = $sample."_1.fq.gz"; #--it depends-- my $in2 = $sample."_2.fq.gz"; if(exists($file2rec->{$in1}) && exists($file2rec->{$in2})) { #print "submit to filtering...\n"; my $out1 = $outDir.$sample."_1.fq.ft"; my $out2 = $outDir.$sample."_2.fq.ft"; $in1 = $dir.$in1; $in2 = $dir.$in2; &filterPE($in1,$in2,$qual,$out1,$out2,$stat); } else { print("PE data not available...\n"); } } } sub filterPE { my ($in1,$in2,$in3,$out1,$out2,$out3) = @_; my ($trimQ,$delAQ,$minLen,$Npercent) = (20,25,40,0.01); #--for merge map #--it depends-- &processFilter($in1,$in2,$in3,$out1,$out2,$out3,$trimQ,$delAQ,$minLen,$Npercent); } sub processFilter { my ($in1,$in2,$in3,$out1,$out2,$out3,$trimQ,$delAQ,$minLen,$Npercent) = @_; my %asc2ps; my $headLink; my %sam2pool; &readSam($in3,\%sam2pool); $in1 =~ /([^\/]+)_1.fq/; #print "$in1\t$1\n"; #-check code for quality #Sanger #!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ %asc2ps = ('!'=>0,'"'=>1,'#'=>2,'$'=>3,'%'=>4, '&'=>5,'\''=>6,'('=>7,')'=>8,'*'=>9, '+'=>10,','=>11,'-'=>12,'.'=>13,'/'=>14, '0'=>15,'1'=>16,'2'=>17,'3'=>18,'4'=>19, '5'=>20,'6'=>21,'7'=>22,'8'=>23,'9'=>24, ':'=>25,';'=>25,'<'=>27,"="=>28,'>'=>29, '?'=>30,'@'=>31,'A'=>32,'B'=>33,'C'=>34, 'D'=>35,'E'=>36,'F'=>37,'G'=>38,'H'=>39, 'I'=>40,'J'=>41); if(exists($sam2pool{$1})) { if($sam2pool{$1} eq "1.8") { $headLink = ' '; &filterPEreads($in1,$in2,$out1,$out2,$out3,\%asc2ps,$trimQ,$delAQ,$minLen,$Npercent,$headLink); } elsif($sam2pool{$1} eq "1.3") { my (@ins,@outs,@thr); ($ins[0],$ins[1]) = ($in1,$in2); $in1 =~ s/\.gz$//; $in2 =~ s/\.gz$//; $outs[0] = $in1."s"; #--it depends-- $outs[1] = $in2."s"; for(my $i=0;$i<=1;$i+=1) { $thr[$i] = threads->create(\&transQR,$ins[$i],$outs[$i]); } for(my $i=0;$i<=1;$i+=1) { $thr[$i]->join; } $headLink = '/'; $in1 = $outs[0].".gz"; $in2 = $outs[1].".gz"; &filterPEreads($in1,$in2,$out1,$out2,$out3,\%asc2ps,$trimQ,$delAQ,$minLen,$Npercent,$headLink); system("rm $in1 $in2"); } } } sub transQR { my ($in,$out) = @_; #system("perl fq2std.pl sol2std $in $out"); &fq2sg($in,$out); } sub fq2sg { my ($in,$out) = @_; my @conv_table; for (-64..64) { $conv_table[$_+64] = chr(int(33 + 10*log(1+10**($_/10.0))/log(10)+.499)); } open(my $FR,"gzip -dc $in |"); open(my $FW,"| gzip >$out.gz"); while($_=<$FR>) { if (/^@/) { print $FW $_; $_ = <$FR>; print $FW $_; $_ = <$FR>; $_ = <$FR>; chomp; my @t = split('', $_); my $qual = ''; $qual .= $conv_table[ord($_)] for (@t); print $FW "+\n$qual\n"; } } close($FR); close($FW); } sub filterPEreads { my ($in1,$in2,$out1,$out2,$out3,$asc2ps,$trimQ,$delAQ,$minLen,$Npercent,$headLink) = @_; open(my $FR1,"gzip -dc $in1 |"); open(my $FR2,"gzip -dc $in2 |"); open(my $FW1,"| gzip >$out1.gz"); open(my $FW2,"| gzip >$out2.gz"); open(my $FW3,">>$out3"); my ($numPReadS,$numBaseS,$numPReadQ,$numBaseQ,$numBaseQA) = (0,0,0,0,0,0); my (@line1,@line2,$sumQ,$i,$templine,$templineN,$len,$len1,$len2,$Nnum,%read2rec); my ($line11,$line12,$line13,$line14,$line21,$line22,$line23,$line24); while(defined($line11=<$FR1>)) { my ($n1,$n2) = ("a","b"); while(!($line11 =~ /^@/)) { print $line11; $line11=<$FR1>; } if($line11 =~ /^@([^\s]+)($headLink)1/) { #--it depends------------ $n1 = $1; } $line12 = <$FR1>; chomp($line12); $line13 = <$FR1>; $line14 = <$FR1>; chomp($line14); $line21 = <$FR2>; while(!($line21 =~ /^@/)) { print $line21; $line21=<$FR2>; } if($line21 =~ /^@([^\s]+)($headLink)2/) { #--it depends------------ $n2 = $1; } $line22 = <$FR2>; chomp($line22); $line23 = <$FR2>; $line24 = <$FR2>; chomp($line24); #if($n1 ne $n2) { # print "$line11\n$n1\n\n$line21\n$n2\n"; # exit; #} #----remove reads with 5% N and average quality low than required my $flag = 1; $numPReadS += 1; $len1 = length($line12); $len2 = length($line22) + $len1; $numBaseS += $len2; &NlowQ($line12,$line14,$asc2ps,\$flag,$delAQ,$Npercent); if($flag==1) { &NlowQ($line22,$line24,$asc2ps,\$flag,$delAQ,$Npercent); #----check duplicated reads if($flag==1) { &rmDuplicates($line12,$line22,\%read2rec,\$flag); #----trim the 3' bases if($flag==1) { my ($tlen,$tlenA) = (0,0); &trim3tail($line12,$line14,$asc2ps,\$flag,$trimQ,$minLen,\$tlen,\$tlenA); if($flag==1) { &trim3tail($line22,$line24,$asc2ps,\$flag,$trimQ,$minLen,\$tlen,\$tlenA); if($flag==1) { $numPReadQ += 1; $numBaseQ += $tlen; $numBaseQA += $tlenA; print $FW1 $line11; print $FW1 $line12,"\n"; print $FW1 $line13; print $FW1 $line14,"\n"; print $FW2 $line21; print $FW2 $line22,"\n"; print $FW2 $line23; print $FW2 $line24,"\n"; } else { #print "flag 0 in trim read2.\n"; } } else { #print "flag 0 in trim read1.\n"; } } else { #print "flag 0 in duplicates.\n"; } } else { #print "flag 0 in read2 N percent.\n"; } } else { #print "flag 0 in read1 N percent.\n"; } } close($FR1); close($FR2); close($FW1); close($FW2); my $ratioRead = $numPReadQ/$numPReadS; my $ratioBase = $numBaseQ/$numBaseS; print $FW3 "$in1\t$numPReadQ\t$numPReadS\t$ratioRead\t$numBaseQ\t$numBaseS\t$ratioBase\t$numBaseQA\n"; close($FW3); } sub trim3tail { my ($seq,$qua,$asc2ps,$flag,$trimQ,$minLen,$tlen,$tlenA) = @_; my $line1ta = $seq; my $line1tb = $qua; my $len = length($line1tb); $$tlenA += $len; my $temp = chop($line1tb); my $tempN = chop($line1ta); $len -= 1; while($asc2ps->{$temp}<$trimQ && $len>0) { $temp = chop($line1tb); $tempN = chop($line1ta); $len -= 1; } if($asc2ps->{$temp}>=$trimQ) { $line1tb = $line1tb.$temp; $line1ta = $line1ta.$tempN; $len += 1; } if($len<$minLen) { $$flag = 0; } else { $$tlen += $len; } } sub rmDuplicates { my ($line1,$line2,$read2rec,$flag) = @_; my $subtemp1 = $line1."-".$line2; my $temp1re = reverse($line1); #only removing duplicated reads from one cluster $temp1re =~ tr/ATCGatcg/TAGCtagc/; my $temp2re = reverse($line2); $temp2re =~ tr/ATCGatcg/TAGCtagc/; my $subtemp2 = $temp2re."-".$temp1re; if(!exists($read2rec->{$subtemp1}) && !exists($read2rec->{$subtemp2})){ $read2rec->{$subtemp1} = "y"; $read2rec->{$subtemp2} = "y"; } else { $$flag = 0; } } sub NlowQ { my ($seq,$qua,$asc2ps,$flag,$delAQ,$Npercent) = @_; my $line1ta = $seq; my $line1tb = $qua; my $len = length($line1tb); my $len1 = length($line1tb); my $temp = chop($line1tb); my $sumQ = $asc2ps->{$temp}; my $tempN = chop($line1ta); my $sumN = 0; if($tempN eq "N") { $sumN += 1; } $len -= 1; while($len>0) { $temp = chop($line1tb); $sumQ += $asc2ps->{$temp}; $tempN = chop($line1ta); if($tempN eq "N") { $sumN += 1; } $len -= 1; #length($line1[3]); } if($sumQ/$len1<$delAQ || $sumN/$len1>$Npercent) { $$flag = 0; } } sub readSam { my ($in,$sam2fq) = @_; open(my $FR,$in); while($_=<$FR>) { chomp; my @temp = split(/\t/); $sam2fq->{$temp[0]} = $temp[1]; } close($FR); } sub readFile { my ($file,$file2rec,$dir) = @_; opendir(DIR,$dir) || die "Error in opening dir $dir\n"; my $filename; while(($filename = readdir(DIR))){ if($filename=~/^(.+)_\d\.fq.*gz/) { #it depends--- _hb_1.fq $file->{$1} = "Y"; $file2rec->{$filename} = "Y"; } } closedir(DIR); }