Skip to content

Commit

Permalink
Merge pull request #10 from yhoogstrate/regions
Browse files Browse the repository at this point in the history
v2.3.0
  • Loading branch information
yhoogstrate authored Sep 13, 2018
2 parents 32f5726 + 244a508 commit a3bca71
Show file tree
Hide file tree
Showing 9 changed files with 74 additions and 9 deletions.
5 changes: 5 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
13-sept-2018: Youri Hoogstrate
* v2.3.0 - Adds arguments -r/--region and -b/--bed-regions to target
specific genomic regions. This way alternative loci can be
excluded for instance.

13-sept-2018: Youri Hoogstrate
* v2.2.0 - Removes --roc argument and adds multiple statistics
including ROC to the -s/--stats argument.
Expand Down
19 changes: 15 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
bam-lorenz-coverage
===================

Generate Lorenz plots and Coverage plots directly from BAM files
This is a free software package that very easily allows you to generate
Lorenz plots and Coverage plots, directly from a BAM file. It can also
output the tables as text documents so you can generate custom plots.
There is also support to only analyse specific regions.

Implemented in:
* Python3 + Matplotlib + Pysam
Expand All @@ -19,6 +22,12 @@ $ python setup.py install
$ bam-lorenz-coverage --help
```

Possible issues:
- pysam is currently incompatible with python 3.7 - manual installation of pysam is still possible (git clone + python setup.py install)
- matplotlib depends on Tk but does not throw an error if it is missing during installation, only at runtime
* debian/ubuntu: sudo apt-get install python3-tk
* arch: pacman -Sy tk

Usage:

```
Expand All @@ -27,13 +36,15 @@ Usage: bam-lorenz-coverage [OPTIONS] INPUT_ALIGNMENT_FILE
Options:
--version Show the version and exit.
-l, --lorenz-table TEXT Output table Lorenz-curve (for stdout use: -)
-x, --roc Output Lorenz-curve ROC to $lorenz_table.roc.txt
[ requires --lorenz-table to be set to file ]
-c, --coverage-table TEXT Output table Coverage-graph (for stdout use: -)
-L, --lorenz-svg TEXT Output figure Lorenz-curve (SVG).
-C, --coverage-svg TEXT Output figure Coverage-graph (SVG).
-s, --stats TEXT Output additional stats to text-file
-r, --region TEXT Scan depth only in selected region <chr:from-to>
(all positions: 1-based)
-b, --bed-regions TEXT Scan depth only in selected positions or regions
(BED file: start: 0-based & end: 1-based)
--help Show this message and exit.
```

The lowercase arguments (-l, -c) allow extraction of the raw data tables for custom plotting. The uppercase arguments (-L, -C) directly generate a plot. The implemented plot only contains one sample per plot. For multi-sample plots, use the column tables and your imagination.
6 changes: 4 additions & 2 deletions bin/bam-lorenz-coverage
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@ from blc.blc import bamlorenzcoverage
@click.option('-L', '--lorenz-svg', nargs=1, help='Output figure Lorenz-curve (SVG).')
@click.option('-C', '--coverage-svg', nargs=1, help='Output figure Coverage-graph (SVG).')
@click.option('-s', '--stats', nargs=1, help='Output additional stats to text-file')
def CLI(lorenz_table, coverage_table, lorenz_svg, coverage_svg, input_alignment_file, stats):
@click.option('-r', '--region', nargs=1, help='Scan depth only in selected region <chr:from-to> (all positions: 1-based)')
@click.option('-b', '--bed-regions', nargs=1, help='Scan depth only in selected positions or regions (BED file: start: 0-based & end: 1-based)')
def CLI(lorenz_table, coverage_table, lorenz_svg, coverage_svg, input_alignment_file, stats, region, bed_regions):
b = bamlorenzcoverage()
idx_observed, n = b.bam_file_to_idx(input_alignment_file)
idx_observed, n = b.bam_file_to_idx(input_alignment_file, region, bed_regions)

if coverage_table or coverage_svg:
cumulative_coverage_curves = b.estimate_cumulative_coverage_curves(idx_observed)
Expand Down
2 changes: 1 addition & 1 deletion blc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""[License: GNU General Public License v3 (GPLv3)]
"""

__version_info__ = ('2', '2', '0')
__version_info__ = ('2', '3', '0')
__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3]
__author__ = 'Youri Hoogstrate'
__homepage__ = 'https://github.com/yhoogstrate/bam-lorenz-coverage'
Expand Down
11 changes: 9 additions & 2 deletions blc/blc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class bamlorenzcoverage:
def __init__(self):
pass

def bam_file_to_idx(self, bam_file):
def bam_file_to_idx(self, bam_file, region=None, bed_regions=None):
"""
Coverage plot needs the zero-statistic - i.e. the number of genomic bases not covered by reads
"""
Expand All @@ -28,8 +28,15 @@ def bam_file_to_idx(self, bam_file):
tmp_filename = os.path.join(tempfile.mkdtemp() + '.fifo')
os.mkfifo(tmp_filename)

cmd = ['-a', bam_file]
if region:
cmd = ['-r', region] + cmd
elif bed_regions:
cmd = ['-b', bed_regions] + cmd
# print(cmd)

# I tried this with the Threading class but this often didnt parallelize
parallel_thread = Process(target=pysam.samtools.depth, args=['-a', bam_file], kwargs={'save_stdout': tmp_filename})
parallel_thread = Process(target=pysam.samtools.depth, args=cmd, kwargs={'save_stdout': tmp_filename})
parallel_thread.start()

fh = os.open(tmp_filename, os.O_RDONLY)
Expand Down
1 change: 1 addition & 0 deletions tests/blc/test_blc_012.sam
1 change: 1 addition & 0 deletions tests/blc/test_blc_013.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
chr1 2 14
1 change: 1 addition & 0 deletions tests/blc/test_blc_013.sam
37 changes: 37 additions & 0 deletions tests/test_class_blc.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,43 @@ def test_011_lorenz_03(self):
self.assertEqual(lc['total_sequenced_bases'], 10)
self.assertEqual(lc['total_covered_positions_of_genome'], 8)

def test_012_region(self):
# everything covered, is at least covered even densely
# x x x x x
# x x x x x
# - - - - - - - - - - - - - -

test_id = 'blc_012'

input_file_sam = TEST_DIR + "test_" + test_id + ".sam"
input_file_bam = T_TEST_DIR + "test_" + test_id + ".bam"

sam_to_sorted_bam(input_file_sam, input_file_bam)

b = bamlorenzcoverage()
idx, n = b.bam_file_to_idx(input_file_bam, 'chr1:2-14')

self.assertEqual(n, 13) # sam header say reference size is 14, but we start at 2nd position

def test_013_bed(self):
# everything covered, is at least covered even densely
# x x x x x
# x x x x x
# - - - - - - - - - - - - - -

test_id = 'blc_013'

input_file_sam = TEST_DIR + "test_" + test_id + ".sam"
input_file_bed = TEST_DIR + "test_" + test_id + ".bed"
input_file_bam = T_TEST_DIR + "test_" + test_id + ".bam"

sam_to_sorted_bam(input_file_sam, input_file_bam)

b = bamlorenzcoverage()
idx, n = b.bam_file_to_idx(input_file_bam, None, input_file_bed)

self.assertEqual(n, 12)


if __name__ == '__main__':
main()

0 comments on commit a3bca71

Please sign in to comment.