Merge pull request #10 from yhoogstrate/regions

v2.3.0
yhoogstrate · Sep 13, 2018 · a3bca71 · a3bca71
2 parents 32f5726 + 244a508
commit a3bca71
Show file tree

Hide file tree

Showing 9 changed files with 74 additions and 9 deletions.
diff --git a/Changelog b/Changelog
@@ -1,3 +1,8 @@
+13-sept-2018:  Youri Hoogstrate
+	* v2.3.0 - Adds arguments -r/--region and -b/--bed-regions to target
+	  specific genomic regions. This way alternative loci can be
+	  excluded for instance.
+
 13-sept-2018:  Youri Hoogstrate
 	* v2.2.0 - Removes --roc argument and adds multiple statistics
 	   including ROC to the -s/--stats argument.

diff --git a/README.md b/README.md
@@ -1,7 +1,10 @@
 bam-lorenz-coverage
 ===================
 
-Generate Lorenz plots and Coverage plots directly from BAM files
+This is a free software package that very easily allows you to generate
+Lorenz plots and Coverage plots, directly from a BAM file. It can also
+output the tables as text documents so you can generate custom plots.
+There is also support to only analyse specific regions.
 
 Implemented in:
  * Python3 + Matplotlib + Pysam
@@ -19,6 +22,12 @@ $ python setup.py install
 $ bam-lorenz-coverage --help
 ```
 
+Possible issues:
+ - pysam is currently incompatible with python 3.7 - manual installation of pysam is still possible (git clone + python setup.py install)
+ - matplotlib depends on Tk but does not throw an error if it is missing during installation, only at runtime
+   * debian/ubuntu: sudo apt-get install python3-tk
+   * arch: pacman -Sy tk
+
 Usage:
 
 ```
@@ -27,13 +36,15 @@ Usage: bam-lorenz-coverage [OPTIONS] INPUT_ALIGNMENT_FILE
 Options:
   --version                  Show the version and exit.
   -l, --lorenz-table TEXT    Output table Lorenz-curve (for stdout use: -)
-  -x, --roc                  Output Lorenz-curve ROC to $lorenz_table.roc.txt
-                             [ requires --lorenz-table to be set to file ]
   -c, --coverage-table TEXT  Output table Coverage-graph (for stdout use: -)
   -L, --lorenz-svg TEXT      Output figure Lorenz-curve (SVG).
   -C, --coverage-svg TEXT    Output figure Coverage-graph (SVG).
+  -s, --stats TEXT           Output additional stats to text-file
+  -r, --region TEXT          Scan depth only in selected region <chr:from-to>
+                             (all positions: 1-based)
+  -b, --bed-regions TEXT     Scan depth only in selected positions or regions
+                             (BED file: start: 0-based & end: 1-based)
   --help                     Show this message and exit.
-
 ```
 
 The lowercase arguments (-l, -c) allow extraction of the raw data tables for custom plotting. The uppercase arguments (-L, -C) directly generate a plot. The implemented plot only contains one sample per plot. For multi-sample plots, use the column tables and your imagination.
diff --git a/bin/bam-lorenz-coverage b/bin/bam-lorenz-coverage
@@ -16,9 +16,11 @@ from blc.blc import bamlorenzcoverage
 @click.option('-L', '--lorenz-svg', nargs=1, help='Output figure Lorenz-curve (SVG).')
 @click.option('-C', '--coverage-svg', nargs=1, help='Output figure Coverage-graph (SVG).')
 @click.option('-s', '--stats', nargs=1, help='Output additional stats to text-file')
-def CLI(lorenz_table, coverage_table, lorenz_svg, coverage_svg, input_alignment_file, stats):
+@click.option('-r', '--region', nargs=1, help='Scan depth only in selected region <chr:from-to> (all positions: 1-based)')
+@click.option('-b', '--bed-regions', nargs=1, help='Scan depth only in selected positions or regions (BED file: start: 0-based & end: 1-based)')
+def CLI(lorenz_table, coverage_table, lorenz_svg, coverage_svg, input_alignment_file, stats, region, bed_regions):
     b = bamlorenzcoverage()
-    idx_observed, n = b.bam_file_to_idx(input_alignment_file)
+    idx_observed, n = b.bam_file_to_idx(input_alignment_file, region, bed_regions)
 
     if coverage_table or coverage_svg:
         cumulative_coverage_curves = b.estimate_cumulative_coverage_curves(idx_observed)

diff --git a/blc/__init__.py b/blc/__init__.py
@@ -5,7 +5,7 @@
 """[License: GNU General Public License v3 (GPLv3)]
 """
 
-__version_info__ = ('2', '2', '0')
+__version_info__ = ('2', '3', '0')
 __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3]
 __author__ = 'Youri Hoogstrate'
 __homepage__ = 'https://github.com/yhoogstrate/bam-lorenz-coverage'

diff --git a/blc/blc.py b/blc/blc.py
@@ -13,7 +13,7 @@ class bamlorenzcoverage:
     def __init__(self):
         pass
 
-    def bam_file_to_idx(self, bam_file):
+    def bam_file_to_idx(self, bam_file, region=None, bed_regions=None):
         """
         Coverage plot needs the zero-statistic - i.e. the number of genomic bases not covered by reads
         """
@@ -28,8 +28,15 @@ def bam_file_to_idx(self, bam_file):
         tmp_filename = os.path.join(tempfile.mkdtemp() + '.fifo')
         os.mkfifo(tmp_filename)
 
+        cmd = ['-a', bam_file]
+        if region:
+            cmd = ['-r', region] + cmd
+        elif bed_regions:
+            cmd = ['-b', bed_regions] + cmd
+        # print(cmd)
+
         # I tried this with the Threading class but this often didnt parallelize
-        parallel_thread = Process(target=pysam.samtools.depth, args=['-a', bam_file], kwargs={'save_stdout': tmp_filename})
+        parallel_thread = Process(target=pysam.samtools.depth, args=cmd, kwargs={'save_stdout': tmp_filename})
         parallel_thread.start()
 
         fh = os.open(tmp_filename, os.O_RDONLY)

diff --git a/tests/blc/test_blc_012.sam b/tests/blc/test_blc_012.sam
@@ -0,0 +1 @@
+test_blc_011.sam
diff --git a/tests/blc/test_blc_013.bed b/tests/blc/test_blc_013.bed
@@ -0,0 +1 @@
+chr1	2	14
diff --git a/tests/blc/test_blc_013.sam b/tests/blc/test_blc_013.sam
@@ -0,0 +1 @@
+test_blc_011.sam
diff --git a/tests/test_class_blc.py b/tests/test_class_blc.py
@@ -226,6 +226,43 @@ def test_011_lorenz_03(self):
         self.assertEqual(lc['total_sequenced_bases'], 10)
         self.assertEqual(lc['total_covered_positions_of_genome'], 8)
 
+    def test_012_region(self):
+        # everything covered, is at least covered even densely
+        #             x x x x x
+        #       x x x x x
+        # - - - - - - - - - - - - - -
+
+        test_id = 'blc_012'
+
+        input_file_sam = TEST_DIR + "test_" + test_id + ".sam"
+        input_file_bam = T_TEST_DIR + "test_" + test_id + ".bam"
+
+        sam_to_sorted_bam(input_file_sam, input_file_bam)
+
+        b = bamlorenzcoverage()
+        idx, n = b.bam_file_to_idx(input_file_bam, 'chr1:2-14')
+
+        self.assertEqual(n, 13)  # sam header say reference size is 14, but we start at 2nd position
+
+    def test_013_bed(self):
+        # everything covered, is at least covered even densely
+        #             x x x x x
+        #       x x x x x
+        # - - - - - - - - - - - - - -
+
+        test_id = 'blc_013'
+
+        input_file_sam = TEST_DIR + "test_" + test_id + ".sam"
+        input_file_bed = TEST_DIR + "test_" + test_id + ".bed"
+        input_file_bam = T_TEST_DIR + "test_" + test_id + ".bam"
+
+        sam_to_sorted_bam(input_file_sam, input_file_bam)
+
+        b = bamlorenzcoverage()
+        idx, n = b.bam_file_to_idx(input_file_bam, None, input_file_bed)
+
+        self.assertEqual(n, 12)
+
 
 if __name__ == '__main__':
     main()