I am writing this post to help researchers trying to analyse their own RRBS data. This is non-technical explanation. Just following the steps may help on any ubuntu/linux system.
Requirements on the computer:
- Bowtie (Manual and Source available from here)
- Human genome in fasta format (Download hg19.2bit from here)
- Cutadapt (Manual and installation instructions are detailed here)
- Trim galore (Perl script is available here)
- Bismark (Excellent resource is available here)
- Methylkit (useful R package for downstream analysis on this google code page)
- Install all the above scripts/programs in the directories in your PATH or export it.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl | |
#This programme is intended to run all the preliminary rrbs steps starting from Trimming, Mapping to Lambda, Mapping to human genome, methylation extraction, .bed file making and 5x, 10x coverage filtering. | |
#This program expects all the files to be present in one folder and they end with '.fastq'. So make sure that no other '.fastq' file exists in the folder. | |
#Collects the list of files to be analysed | |
@files = `ls | grep ".fastq\$"`; | |
$num_files = scalar@files; | |
print "Files to be analysed for the RRBS are the following $num_files files:\n##########\n@files############\n"; | |
#performs the RRBS analysis for each file | |
foreach $trim_file(@files) { | |
chomp; | |
#Quality and Adapter trimming is performed | |
trim_galore ($trim_file); | |
@file_words = split(/\./, $trim_file); | |
$root = @file_words[0]; #extracts the file stub to be used | |
$trimmed_file = "$root"."_trimmed.fq"; #print "$trimmed_file\n"; | |
#Performs the removal of endfilled two nucleotides in a full length sequence (50bp single end read) | |
rm_endfilled_c ($trimmed_file, $root); | |
$final_file = "$root"."_trim.fastq"; | |
$lambda = "$root"."_lam"; | |
$hg19 = "$root"."_hg19"; | |
$remove = "$final_file"."_C_to_T.fastq"; | |
#performs mapping with bismark first with lambda genome and next with hg19 (previously prepared by bismark) | |
bismark_lam ($final_file, $lambda); | |
bismark_hg19 ($final_file, $hg19); | |
system ("rm $remove");#removes the temporary CT converted file | |
$path = `pwd`; #print "$path\n"; | |
chomp $path; | |
$dir = "$path"."/"."$hg19"; #print "$path\n";#changes directory into the bismark result | |
chdir ($dir)|| die "fuck the $dir\n"; | |
methylation_extractor_bedmake ($root); | |
chdir ($path) || die "Can not move back to $path\n"; | |
} | |
open (REP, ">bismark_report.txt"); | |
print REP "======================= Extracting the report for lambda mapping ======================\n"; | |
system("mkdir bismark_reports"); | |
chdir ("bismark_reports"); | |
system ("cp ../*lam/*Bismark_mapping_report.txt ."); | |
@files = `ls`; | |
extract_bismark_mapping_report(@files); | |
print REP "======================= Extracting the report for hg19 mapping ======================\n"; | |
system ("cp ../*hg19/*Bismark_mapping_report.txt ."); | |
@files = `ls`; | |
extract_bismark_mapping_report(@files); | |
chdir ($path); | |
system ("rm -r bismark_reports"); | |
merge_strand_coverage (@files, $dir); | |
convert_combinedBED_to_methylKitformat(@files); | |
sub convert_combinedBED_to_methylKitformat { | |
foreach $root (@files) { | |
$root =~ s/_trim.fastq_Bismark_mapping_report.txt//; | |
$file = $root."_CpG_combindcov_5x.bed"; #print "$file\n"; | |
print ("Converting $root bed file into Methylkit format \n"); | |
$output = $root."_CpG_combindcov_5x.txt"; | |
open (OU, ">$output"); | |
print OU ("chrBase\tchr\tbase\tstrand\tcoverage\tfreqC\tfreqT\n"); | |
convert ($file); | |
close OU; | |
} | |
} | |
sub convert { | |
open (FI, $file); | |
while (<FI>) { | |
@line = split (/\t/, $_); | |
@cov = split (/[(:+)]/, $line[3]); #print "@cov\n"; | |
$per_T = 100 - $line[4]; | |
print OU ("$line[0].$line[2]\t$line[0]\t$line[2]\tF\t$cov[1]\t$line[4]\t$per_T\n"); | |
} | |
} | |
sub merge_strand_coverage { | |
foreach $root (@files) { | |
$root =~ s/_trim.fastq_Bismark_mapping_report.txt\n//; | |
chdir ($dir); | |
$otfile=$root."_OT_cpg_1x.bed"; | |
$obfile=$root."_OB_cpg_1x.bed"; | |
print "Merging the coverage information from both strands\n"; | |
%othash = create_hash ($otfile); | |
%obhash = create_hash ($obfile); | |
$outfile = $root."_CpG_combindcov_5x.bed"; | |
open (OUT, ">tmp"); | |
foreach $key (keys %obhash) { | |
if (exists $othash{$key}) { #checks if the corresponding CpG info exists for the plus strand | |
($ot_totalcov, $ot_met,$ot_nonmet,$ot_cpgpos) = getcpginfo ($othash{$key}); #extracts the coverage info from + strand | |
($ob_totalcov, $ob_met,$ob_nonmet,$ob_cpgpos) = getcpginfo ($obhash{$key}); #extracts the coverage info from - strand | |
# print "$ot_totalcov $ot_met $ot_nonmet $ot_cpgpos $ob_totalcov $ob_met $ob_nonmet $ob_cpgpos\n"; | |
$new_met = $ot_met + $ob_met; #Calculates the new coverage info | |
$new_nonmet = $ot_nonmet + $ob_nonmet; | |
$new_totalcov = $ot_totalcov + $ob_totalcov; | |
# print "$new_met\t$new_nonmet\t$new_totalcov\n"; | |
$meth_percentage = $new_met*100/$new_totalcov; | |
if ($meth_percentage <= 20) { $rgb = "124,252,0";} # gives the color lawn green | |
elsif ($meth_percentage <= 40 ) { $rgb = "105,105,105";} #color Sienna | |
elsif ($meth_percentage <= 60 ) { $rgb = "0,0,255";} #color Blue | |
elsif ($meth_percentage <= 80 ) { $rgb = "0,0,0";} #color Black | |
elsif ($meth_percentage > 80) { $rgb = "255,0,0";} #color Red | |
@newline = split (/\t/, $othash{$key}); | |
$chr = $newline[0]; | |
$pos1 = $newline[1]; | |
$pos2 = $newline[2]; | |
$newcpg = "CpG-".$pos2."(".$new_totalcov.":".$new_met."+".$new_nonmet.")"; | |
if ($new_totalcov >=5) { | |
print OUT"$chr\t$pos1\t$pos2\t$newcpg\t$meth_percentage\t+\t0\t0\t$rgb\n"; #prints the merged coverage info | |
} | |
} | |
elsif (!exists $othash{$key}) {#If the corresponding coverage info doesnt exists for the +strand | |
@newline = split (/\t/, $obhash{$key}); | |
$chr = $newline[0]; | |
$pos1 = $newline[1]-1;#converts the co-ordinates into + strand co-ordinates | |
$pos2 = $newline[2]-1; | |
($ob_totalcov, $ob_met,$ob_nonmet,$ob_cpgpos) = getcpginfo ($obhash{$key}); | |
$newcpg = "CpG-".$pos2."(".$ob_totalcov.":".$ob_met."+".$ob_nonmet.")"; #modifies the - strand cpg info into +strand info | |
$rgb = $newline[8]; | |
if ($ob_totalcov >=5) { | |
print OUT"$chr\t$pos1\t$pos2\t$newcpg\t$newline[4]\t+\t0\t0\t$rgb";#prints the -strand info as a +strand info | |
} | |
} | |
} | |
foreach $key (keys %othash) { | |
if(!exists $obhash{$key}) { | |
($ot_totalcov, $ot_met,$ot_nonmet,$ot_cpgpos) = getcpginfo ($othash{$key}); | |
if($ot_totalcov>=5) { | |
print OUT $othash{$key}; | |
} | |
} | |
} | |
close OUT; | |
system ("sort -k1,1 -k2,2n tmp > $outfile; rm tmp"); | |
} | |
} | |
sub create_hash { #subroutine for creating hash | |
$file = shift @_; | |
open (IN, $file); | |
%hash = (); | |
while (<IN>) { | |
$line = $_; | |
$cpgpos = getcpginfo($line); | |
if ($file =~ /OT/) { | |
$key = $chr."_"."$cpgpos "; | |
} | |
elsif ($file =~ /OB/) { | |
$new_cpgpos = $cpgpos-1; | |
$key = $chr."_"."$new_cpgpos "; | |
} | |
$hash{$key} = $line; | |
} | |
return %hash; | |
} | |
sub getcpginfo { #subroutine for getting cpginfo | |
@line = split (/\t/, shift @_); | |
$chr = $line[0]; | |
$line[3] =~ /CpG-(\d+)\((\d+):(\d+)\+(\d+)\)/; | |
$cpgpos = $1; | |
$totalcov = $2; | |
$met = $3; | |
$nonmet = $4; | |
return ($totalcov,$met,$nonmet,$cpgpos); | |
} | |
sub extract_bismark_mapping_report{ | |
print REP "Sample\tReads_analysed\tMapped_reads\tMap_eff\tCs_Anal\tMethyl Cs\tBisulf_convsn_eff(%)\n"; | |
foreach $file (@files) { | |
open (BIS, "$file") || print "File $file can not be opened. $!\n"; | |
@bismark = <BIS>; | |
$root = $file; | |
$root =~ s/_trim.fastq_Bismark_mapping_report.txt\n//g; | |
$total_seqs = analysed_seqs (@bismark); | |
$seqs = get_mapped_seqs(@bismark); | |
$map_eff = get_map_eff (@bismark); | |
$total_cs = total_c_analysed (@bismark); | |
($total_methC, $bis_eff) = c_to_convrsn_eff (@bismark); | |
print REP "$root\t$total_seqs\t$seqs\t$map_eff\t$total_cs\t$total_methC\t$bis_eff\n"; | |
system ("rm $file"); | |
} | |
} | |
sub analysed_seqs { | |
my @temp; | |
@temp = split (/\t/, $bismark[6]); | |
chomp $temp[1]; | |
return $temp[1]; | |
} | |
sub get_mapped_seqs { | |
my @temp; | |
@temp = split (/\t/, $bismark[7]); | |
chomp $temp[1]; | |
return $temp[1]; | |
} | |
sub get_map_eff { | |
my @temp; | |
@temp = split (/\t/, $bismark[8]); | |
chomp $temp[1]; | |
return $temp[1]; | |
} | |
sub total_c_analysed { | |
my @temp; | |
@temp = split (/\t/, $bismark[23]); | |
chomp $temp[1]; | |
return $temp[1]; | |
} | |
sub c_to_convrsn_eff { | |
my @temp26; my @temp27; my @temp28; my $total; my $eff; | |
@temp26 = split (/\t/, $bismark[25]); | |
@temp27 = split (/\t/, $bismark[26]); | |
@temp28 = split (/\t/, $bismark[27]); | |
chomp $temp26[1]; chomp $temp27[1]; chomp $temp28[1]; | |
$total = $temp26[1]+$temp27[1]+$temp28[1]; | |
$eff = ($total_cs-$total)*100/$total_cs; | |
return ($total, $eff); | |
} | |
sub methylation_extractor_bedmake { | |
$sam = "$root"."_trim.fastq_bismark.sam"; | |
system ("methylation_extractor -s --merge_non_CpG $sam"); | |
$cpg_OT_context = "CpG_OT_".$sam.".txt"; print "$cpg_OT_context\n"; | |
$cpg_OB_context = "CpG_OB_".$sam.".txt"; print "$cpg_OB_context\n"; | |
$otbed = "$root"."_OT_cpg_1x.bed"; print "$otbed\n"; | |
$obbed = "$root"."_OB_cpg_1x.bed"; print "$obbed\n"; | |
system ("genome_methylation_bismark2bedGraph_v4.pl --s --counts $cpg_OT_context > $otbed"); | |
system ("genome_methylation_bismark2bedGraph_v4.pl --s --counts $cpg_OB_context > $obbed"); | |
system ("rm *CTOT* *CTOB*"); | |
} | |
sub bismark_lam { | |
print "================================\nMapping the Data to lambda genome\n============================\n"; | |
system ("bismark_modifiedv0.7.5 -n 2 -l 51 /home/kalyankpy/dname/genome_lambda/ $final_file -o $lambda/"); | |
} | |
sub bismark_hg19 { | |
print "================================\nMapping the Data to hg19 genome\n==============================\n"; | |
system ("bismark_modifiedv0.7.5 -n 2 -l 51 /home/kalyankpy/dname/genome_hg19/ $final_file -o $hg19/"); | |
} | |
sub trim_galore { | |
system ("trim_galore_mod --rrbs --length 30 --fastqc $trim_file"); | |
} | |
#Removes the endfilled two nucleotides in a full length 51 bp read | |
sub rm_endfilled_c { | |
open (IN, " paste - - - - < $trimmed_file|"); | |
print "Trimming the last two terminal nucleotides in a full length read from $trimmed_file\n"; | |
$endfil_removed_file = "$root"."_trim.fastq"; | |
open (OUT, ">$endfil_removed_file"); | |
print ("Created $endfil_removed_file\n"); | |
while (<IN>) { | |
@elem = split (/\t/, $_); | |
$h = $elem[0]; | |
$s = $elem[1]; | |
$y = $elem[2]; | |
$q = $elem[3]; | |
if (length($s) == 51 && $s =~ /CTG$|TTG$/){ #checking for the full length read ending with the pattern. Modify the number if the sequencing is of different length | |
$s = substr($s,0,49); | |
$q = substr($q,0,49); | |
print OUT ("$h\n$s\n$y\n$q\n"); | |
} | |
else { | |
print OUT ("$h\n$s\n$y\n$q"); | |
} | |
} | |
close IN; | |
close OUT; | |
} |