BEDTools

From Dave's wiki
Jump to navigation Jump to search

Installing

git clone https://github.com/arq5x/bedtools2.git
cd bedtools2
#step for Ubuntu
#sudo apt-get install build-essential g++
make clean; make all
mkdir -p ~/bin
ln -s `pwd`/bin/* ~/bin
bedtools --version
bedtools v2.19.0-7-g4793c98

Creating hg19.genome

#get hg19 genome file from the UCSC Genome Browser
mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e "select chrom, size from hg19.chromInfo" > hg19.genome

Generating a random BED file

randomBed -n 10 -seed 123 -g hg19.genome
chr12	25375663	25375763	1	100	-
chr15	44488992	44489092	2	100	-
chr3	112583172	112583272	3	100	-
chr13	64998419	64998519	4	100	-
chr6	163661107	163661207	5	100	-
chr20	35840067	35840167	6	100	+
chr10	68752842	68752942	7	100	+
chr10	103411716	103411816	8	100	-
chr2	230672449	230672549	9	100	-
chr5	159528659	159528759	10	100	-

randomBed -n 10 -seed 123 -g ~/genome/hg19.genome | sortBed -chrThenScoreA
chr10	68752842	68752942	7	100	+
chr10	103411716	103411816	8	100	-
chr12	25375663	25375763	1	100	-
chr13	64998419	64998519	4	100	-
chr15	44488992	44489092	2	100	-
chr2	230672449	230672549	9	100	-
chr20	35840067	35840167	6	100	+
chr3	112583172	112583272	3	100	-
chr5	159528659	159528759	10	100	-
chr6	163661107	163661207	5	100	-

Nucleotide frequency

#nucBed (no documentation yet at https://bedtools.readthedocs.org/en/latest/content/tools/nuc.html)
#Profiles the nucleotide content of intervals in a fasta file
randomBed -n 10 -seed 123 -g ~/genome/hg19.genome > test.bed
nucBed -fi ~/genome/hg19/hg19.fa -bed test.bed 
#1_usercol	2_usercol	3_usercol	4_usercol	5_usercol	6_usercol	7_pct_at	8_pct_gc	9_num_A	10_num_C	11_num_G	12_num_T	13_num_N	14_num_oth 15_seq_len	
chr12	25375663	25375763	1	100	-	0.720000	0.280000	38	12	16	34	0	0	100
chr15	44488992	44489092	2	100	-	0.610000	0.390000	20	23	16	41	0	0	100
chr3	112583172	112583272	3	100	-	0.660000	0.340000	21	12	22	45	0	0	100
chr13	64998419	64998519	4	100	-	0.700000	0.300000	29	16	14	41	0	0	100
chr6	163661107	163661207	5	100	-	0.340000	0.660000	19	43	23	15	0	0	100
chr20	35840067	35840167	6	100	+	0.760000	0.240000	26	10	14	50	0	0	100
chr10	68752842	68752942	7	100	+	0.510000	0.490000	28	24	25	23	0	0	100
chr10	103411716	103411816	8	100	-	0.500000	0.500000	29	31	19	21	0	0	100
chr2	230672449	230672549	9	100	-	0.530000	0.470000	25	30	17	28	0	0	100
chr5	159528659	159528759	10	100	-	0.700000	0.300000	37	15	15	33	0	0	100

Create genomic intervals

#windowMaker
#Divide the human genome into windows of 1MB
#store only chromosome 1
windowMaker -g ~/genome/hg19.genome -w 1000000 | grep "^chr1\b" > chr1_1000_mb.bed

Find the closest elements

#closestBed
#for each feature in A, finds the closest 
#feature (upstream or downstream) in B.
cat a.bed
chr1    100     500     a
cat b.bed
chr1    2000    3000    b
#-d      In addition to the closest feature in B, 
#        report its distance to A as an extra column.
#        - The reported distance for overlapping features will be 0.
bedtools closest -d -a a.bed -b b.bed
chr1    100     500     a       chr1    2000    3000    b       1501
#this can be used with finding closest features that are on the opposite strand
bedtools closest -d -S -a a.bed -b b.bed
#for overlapping features, d = 0
cat c.bed 
chr1    499     1000    c
bedtools closest -a a.bed -b c.bed -d
chr1    100     500     a       chr1    499     1000    c       0

Finding sense-antisense pairs

cat a.bed 
chr1	10	20	id1	0	+
chr1	21	30	id2	0	+
chr1	31	40	id3	0	+
chr1	41	50	id4	0	+
chr1	51	60	id5	0	+
cat b.bed
chr1	15	25	id1	0	+
chr1	26	35	id2	0	+
chr1	36	45	id3	0	-
chr1	46	55	id4	0	+
chr1	56	65	id5	0	+
#there is one sense-antisense pair
intersectBed -S -a a.bed -b b.bed -wo
chr1	31	40	id3	0	+	chr1	36	45	id3	0	-	4
chr1	41	50	id4	0	+	chr1	36	45	id3	0	-	4

Map

Multiple intersections

cat test.bed 
chr1    10      20
chr1    12      15
chr2    10      20

cat test1.bed 
chr1    10      20
chr3    10      20

cat test2.bed 
chr1    10      20
chr4    10      20

bedtools multiinter -i test.bed test1.bed test2.bed 
chr1    10      20      3       1,2,3   1       1       1
chr2    10      20      1       1       1       0       0
chr3    10      20      1       2       0       1       0
chr4    10      20      1       3       0       0       1

Calculating extent of overlap

Using bedtools jaccard (http://bedtools.readthedocs.org/en/latest/content/tools/jaccard.html)

cat a.bed 
chr1    10      20
cat b.bed
chr1    15      25

bedtools jaccard -a a.bed -b b.bed
intersection    union-intersection      jaccard n_intersections
5       15      0.333333        1
#two features
cat a.bed 
chr1    10      20
chr1    100     200
cat b.bed
chr1    15      25
chr1    180     220
bedtools jaccard -a a.bed -b b.bed 
intersection    union-intersection      jaccard n_intersections
25      135     0.185185        2

Coverage

See http://bedtools.readthedocs.org/en/latest/content/tools/coverage.html

# bedtools coverage [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>
cat a.bed 
chr1    1       100     a       0
# second BED file
cat b.bed 
chr1    1       25      b       0
chr1    26      50      b       0
chr1    51      75      b       0
chr1    76      100     b       0
# coverage
bedtools coverage -a a.bed -b b.bed 
chr1    1       100     a       0       4       96      99      0.9696970

Notice that it's not 100%; this is because BED files have 0-based starts (see genome coordinates page).

cat a.bed
chr1    0       100     a       0
cat b.bed
chr1    0       25      b       0
chr1    0       25      b       0
chr1    25      50      b       0
chr1    25      50      b       0
chr1    50      75      b       0
chr1    50      75      b       0
chr1    75      100     b       0
chr1    75      100     b       0

bedtools coverage -a a.bed -b b.bed 
chr1    0       100     a       0       8       100     100     1.0000000

For per base coverage use the -d parameter

cat a.bed
chr1    0       100
cat b.bed 
chr1    0       1       b       0
chr1    1       2       b       0
chr1    1       2       b       0
chr1    2       3       b       0
chr1    2       3       b       0
chr1    2       3       b       0
chr1    3       4       b       0
chr1    3       4       b       0
chr1    3       4       b       0
chr1    3       4       b       0

bedtools coverage -a a.bed -b b.bed -d | head
chr1    0       100     a       0       1       1
chr1    0       100     a       0       2       2
chr1    0       100     a       0       3       3
chr1    0       100     a       0       4       4
chr1    0       100     a       0       5       0
chr1    0       100     a       0       6       0
chr1    0       100     a       0       7       0
chr1    0       100     a       0       8       0
chr1    0       100     a       0       9       0
chr1    0       100     a       0       10      0

Random sequences

These are some Perl scripts that I wrote for generating random sequences that may be useful for testing (but are not related to BEDTools).

mkdir -p ~/gist
# script for subsetting a fasta file
git clone https://gist.github.com/23a21e26d7ab695113af.git ~/gist/subset_fasta
# script to generate a random fasta file
git clone https://gist.github.com/3e9c8ae57eded71c84de.git ~/gist/random_fasta
# generate a 100 bp random fasta file with seed 31
~/gist/random_fasta/generate_random_seq.pl 100 31 > 100.fa
# subset the fasta file with 20 reads of length 100
~/gist/subset_fasta/subset_random_seq.pl 100.fa 20 100 > 20_100.fq