From 3022eab0854992fb40894c7b1dd0feca8400b2fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ann-Kathrin=20Br=C3=BCggemann?= <90249112+AKBrueggemann@users.noreply.github.com> Date: Wed, 2 Feb 2022 10:46:56 +0100 Subject: [PATCH] feat: rule and script bed_to_bedpe (#447) * Rule and script bed_to_bedpe * Added bed to config, changed rule input paths * Bed files can be downloaded via url in config * fmt --- .tests/config/config.yaml | 2 +- .tests/resources/nCoV-2019.primer.bed | 218 +++++++++++++++++++++++ config/config.yaml | 4 +- resources/nCoV-2019.primer.bed | 218 +++++++++++++++++++++++ workflow/rules/common.smk | 11 ++ workflow/rules/read_clipping.smk | 19 +- workflow/scripts/bed-to-bedpe.py | 27 +++ workflow/scripts/plot-primer-clipping.py | 2 +- 8 files changed, 494 insertions(+), 7 deletions(-) create mode 100644 .tests/resources/nCoV-2019.primer.bed create mode 100644 resources/nCoV-2019.primer.bed create mode 100644 workflow/scripts/bed-to-bedpe.py diff --git a/.tests/config/config.yaml b/.tests/config/config.yaml index 24aaa2d61..f7acb92b9 100644 --- a/.tests/config/config.yaml +++ b/.tests/config/config.yaml @@ -65,7 +65,7 @@ preprocessing: # for more information artic-primer-version: 3 # path to amplicon primers in bedpe format for hard-clipping on paired end files (illumina) - amplicon-primers: "resources/primer.bedpe" + amplicon-primers: "resources/nCoV-2019.primer.bed" # GenBank accession of reference sequence of the amplicon primers amplicon-reference: "MN908947" diff --git a/.tests/resources/nCoV-2019.primer.bed b/.tests/resources/nCoV-2019.primer.bed new file mode 100644 index 000000000..76691de9a --- /dev/null +++ b/.tests/resources/nCoV-2019.primer.bed @@ -0,0 +1,218 @@ +MN908947.3 30 54 nCoV-2019_1_LEFT 1 + +MN908947.3 385 410 nCoV-2019_1_RIGHT 1 - +MN908947.3 320 342 nCoV-2019_2_LEFT 2 + +MN908947.3 704 726 nCoV-2019_2_RIGHT 2 - +MN908947.3 642 664 nCoV-2019_3_LEFT 1 + +MN908947.3 1004 1028 nCoV-2019_3_RIGHT 1 - +MN908947.3 943 965 nCoV-2019_4_LEFT 2 
+ +MN908947.3 1312 1337 nCoV-2019_4_RIGHT 2 - +MN908947.3 1242 1264 nCoV-2019_5_LEFT 1 + +MN908947.3 1623 1651 nCoV-2019_5_RIGHT 1 - +MN908947.3 1573 1595 nCoV-2019_6_LEFT 2 + +MN908947.3 1942 1964 nCoV-2019_6_RIGHT 2 - +MN908947.3 1875 1897 nCoV-2019_7_LEFT 1 + +MN908947.3 1868 1890 nCoV-2019_7_LEFT_alt0 1 + +MN908947.3 2247 2269 nCoV-2019_7_RIGHT 1 - +MN908947.3 2242 2264 nCoV-2019_7_RIGHT_alt5 1 - +MN908947.3 2181 2205 nCoV-2019_8_LEFT 2 + +MN908947.3 2568 2592 nCoV-2019_8_RIGHT 2 - +MN908947.3 2505 2529 nCoV-2019_9_LEFT 1 + +MN908947.3 2504 2528 nCoV-2019_9_LEFT_alt4 1 + +MN908947.3 2882 2904 nCoV-2019_9_RIGHT 1 - +MN908947.3 2880 2902 nCoV-2019_9_RIGHT_alt2 1 - +MN908947.3 2826 2850 nCoV-2019_10_LEFT 2 + +MN908947.3 3183 3210 nCoV-2019_10_RIGHT 2 - +MN908947.3 3144 3166 nCoV-2019_11_LEFT 1 + +MN908947.3 3507 3531 nCoV-2019_11_RIGHT 1 - +MN908947.3 3460 3482 nCoV-2019_12_LEFT 2 + +MN908947.3 3826 3853 nCoV-2019_12_RIGHT 2 - +MN908947.3 3771 3795 nCoV-2019_13_LEFT 1 + +MN908947.3 4142 4164 nCoV-2019_13_RIGHT 1 - +MN908947.3 4054 4077 nCoV-2019_14_LEFT 2 + +MN908947.3 4044 4068 nCoV-2019_14_LEFT_alt4 2 + +MN908947.3 4428 4450 nCoV-2019_14_RIGHT 2 - +MN908947.3 4402 4424 nCoV-2019_14_RIGHT_alt2 2 - +MN908947.3 4294 4321 nCoV-2019_15_LEFT 1 + +MN908947.3 4296 4322 nCoV-2019_15_LEFT_alt1 1 + +MN908947.3 4674 4696 nCoV-2019_15_RIGHT 1 - +MN908947.3 4666 4689 nCoV-2019_15_RIGHT_alt3 1 - +MN908947.3 4636 4658 nCoV-2019_16_LEFT 2 + +MN908947.3 4995 5017 nCoV-2019_16_RIGHT 2 - +MN908947.3 4939 4966 nCoV-2019_17_LEFT 1 + +MN908947.3 5296 5321 nCoV-2019_17_RIGHT 1 - +MN908947.3 5230 5259 nCoV-2019_18_LEFT 2 + +MN908947.3 5257 5287 nCoV-2019_18_LEFT_alt2 2 + +MN908947.3 5620 5644 nCoV-2019_18_RIGHT 2 - +MN908947.3 5620 5643 nCoV-2019_18_RIGHT_alt1 2 - +MN908947.3 5563 5586 nCoV-2019_19_LEFT 1 + +MN908947.3 5932 5957 nCoV-2019_19_RIGHT 1 - +MN908947.3 5867 5894 nCoV-2019_20_LEFT 2 + +MN908947.3 6247 6272 nCoV-2019_20_RIGHT 2 - +MN908947.3 6167 6196 nCoV-2019_21_LEFT 1 + 
+MN908947.3 6168 6197 nCoV-2019_21_LEFT_alt2 1 + +MN908947.3 6528 6550 nCoV-2019_21_RIGHT 1 - +MN908947.3 6526 6548 nCoV-2019_21_RIGHT_alt0 1 - +MN908947.3 6466 6495 nCoV-2019_22_LEFT 2 + +MN908947.3 6846 6873 nCoV-2019_22_RIGHT 2 - +MN908947.3 6718 6745 nCoV-2019_23_LEFT 1 + +MN908947.3 7092 7117 nCoV-2019_23_RIGHT 1 - +MN908947.3 7035 7058 nCoV-2019_24_LEFT 2 + +MN908947.3 7389 7415 nCoV-2019_24_RIGHT 2 - +MN908947.3 7305 7332 nCoV-2019_25_LEFT 1 + +MN908947.3 7671 7694 nCoV-2019_25_RIGHT 1 - +MN908947.3 7626 7651 nCoV-2019_26_LEFT 2 + +MN908947.3 7997 8019 nCoV-2019_26_RIGHT 2 - +MN908947.3 7943 7968 nCoV-2019_27_LEFT 1 + +MN908947.3 8319 8341 nCoV-2019_27_RIGHT 1 - +MN908947.3 8249 8275 nCoV-2019_28_LEFT 2 + +MN908947.3 8635 8661 nCoV-2019_28_RIGHT 2 - +MN908947.3 8595 8619 nCoV-2019_29_LEFT 1 + +MN908947.3 8954 8983 nCoV-2019_29_RIGHT 1 - +MN908947.3 8888 8913 nCoV-2019_30_LEFT 2 + +MN908947.3 9245 9271 nCoV-2019_30_RIGHT 2 - +MN908947.3 9204 9226 nCoV-2019_31_LEFT 1 + +MN908947.3 9557 9585 nCoV-2019_31_RIGHT 1 - +MN908947.3 9477 9502 nCoV-2019_32_LEFT 2 + +MN908947.3 9834 9858 nCoV-2019_32_RIGHT 2 - +MN908947.3 9784 9806 nCoV-2019_33_LEFT 1 + +MN908947.3 10146 10171 nCoV-2019_33_RIGHT 1 - +MN908947.3 10076 10099 nCoV-2019_34_LEFT 2 + +MN908947.3 10437 10459 nCoV-2019_34_RIGHT 2 - +MN908947.3 10362 10384 nCoV-2019_35_LEFT 1 + +MN908947.3 10737 10763 nCoV-2019_35_RIGHT 1 - +MN908947.3 10666 10688 nCoV-2019_36_LEFT 2 + +MN908947.3 11048 11074 nCoV-2019_36_RIGHT 2 - +MN908947.3 10999 11022 nCoV-2019_37_LEFT 1 + +MN908947.3 11372 11394 nCoV-2019_37_RIGHT 1 - +MN908947.3 11306 11331 nCoV-2019_38_LEFT 2 + +MN908947.3 11668 11693 nCoV-2019_38_RIGHT 2 - +MN908947.3 11555 11584 nCoV-2019_39_LEFT 1 + +MN908947.3 11927 11949 nCoV-2019_39_RIGHT 1 - +MN908947.3 11863 11889 nCoV-2019_40_LEFT 2 + +MN908947.3 12234 12256 nCoV-2019_40_RIGHT 2 - +MN908947.3 12110 12133 nCoV-2019_41_LEFT 1 + +MN908947.3 12465 12490 nCoV-2019_41_RIGHT 1 - +MN908947.3 12417 12439 nCoV-2019_42_LEFT 
2 + +MN908947.3 12779 12802 nCoV-2019_42_RIGHT 2 - +MN908947.3 12710 12732 nCoV-2019_43_LEFT 1 + +MN908947.3 13074 13096 nCoV-2019_43_RIGHT 1 - +MN908947.3 13005 13027 nCoV-2019_44_LEFT 2 + +MN908947.3 13007 13029 nCoV-2019_44_LEFT_alt3 2 + +MN908947.3 13378 13400 nCoV-2019_44_RIGHT 2 - +MN908947.3 13363 13385 nCoV-2019_44_RIGHT_alt0 2 - +MN908947.3 13319 13344 nCoV-2019_45_LEFT 1 + +MN908947.3 13307 13336 nCoV-2019_45_LEFT_alt2 1 + +MN908947.3 13669 13699 nCoV-2019_45_RIGHT 1 - +MN908947.3 13660 13689 nCoV-2019_45_RIGHT_alt7 1 - +MN908947.3 13599 13621 nCoV-2019_46_LEFT 2 + +MN908947.3 13602 13625 nCoV-2019_46_LEFT_alt1 2 + +MN908947.3 13962 13984 nCoV-2019_46_RIGHT 2 - +MN908947.3 13961 13984 nCoV-2019_46_RIGHT_alt2 2 - +MN908947.3 13918 13946 nCoV-2019_47_LEFT 1 + +MN908947.3 14271 14299 nCoV-2019_47_RIGHT 1 - +MN908947.3 14207 14232 nCoV-2019_48_LEFT 2 + +MN908947.3 14579 14601 nCoV-2019_48_RIGHT 2 - +MN908947.3 14545 14570 nCoV-2019_49_LEFT 1 + +MN908947.3 14898 14926 nCoV-2019_49_RIGHT 1 - +MN908947.3 14865 14895 nCoV-2019_50_LEFT 2 + +MN908947.3 15224 15246 nCoV-2019_50_RIGHT 2 - +MN908947.3 15171 15193 nCoV-2019_51_LEFT 1 + +MN908947.3 15538 15560 nCoV-2019_51_RIGHT 1 - +MN908947.3 15481 15503 nCoV-2019_52_LEFT 2 + +MN908947.3 15861 15886 nCoV-2019_52_RIGHT 2 - +MN908947.3 15827 15851 nCoV-2019_53_LEFT 1 + +MN908947.3 16186 16209 nCoV-2019_53_RIGHT 1 - +MN908947.3 16118 16144 nCoV-2019_54_LEFT 2 + +MN908947.3 16485 16510 nCoV-2019_54_RIGHT 2 - +MN908947.3 16416 16444 nCoV-2019_55_LEFT 1 + +MN908947.3 16804 16833 nCoV-2019_55_RIGHT 1 - +MN908947.3 16748 16770 nCoV-2019_56_LEFT 2 + +MN908947.3 17130 17152 nCoV-2019_56_RIGHT 2 - +MN908947.3 17065 17087 nCoV-2019_57_LEFT 1 + +MN908947.3 17430 17452 nCoV-2019_57_RIGHT 1 - +MN908947.3 17381 17406 nCoV-2019_58_LEFT 2 + +MN908947.3 17738 17761 nCoV-2019_58_RIGHT 2 - +MN908947.3 17674 17697 nCoV-2019_59_LEFT 1 + +MN908947.3 18036 18062 nCoV-2019_59_RIGHT 1 - +MN908947.3 17966 17993 nCoV-2019_60_LEFT 2 + +MN908947.3 
18324 18348 nCoV-2019_60_RIGHT 2 - +MN908947.3 18253 18275 nCoV-2019_61_LEFT 1 + +MN908947.3 18650 18672 nCoV-2019_61_RIGHT 1 - +MN908947.3 18596 18618 nCoV-2019_62_LEFT 2 + +MN908947.3 18957 18979 nCoV-2019_62_RIGHT 2 - +MN908947.3 18896 18918 nCoV-2019_63_LEFT 1 + +MN908947.3 19275 19297 nCoV-2019_63_RIGHT 1 - +MN908947.3 19204 19232 nCoV-2019_64_LEFT 2 + +MN908947.3 19591 19616 nCoV-2019_64_RIGHT 2 - +MN908947.3 19548 19570 nCoV-2019_65_LEFT 1 + +MN908947.3 19911 19939 nCoV-2019_65_RIGHT 1 - +MN908947.3 19844 19866 nCoV-2019_66_LEFT 2 + +MN908947.3 20231 20255 nCoV-2019_66_RIGHT 2 - +MN908947.3 20172 20200 nCoV-2019_67_LEFT 1 + +MN908947.3 20542 20572 nCoV-2019_67_RIGHT 1 - +MN908947.3 20472 20496 nCoV-2019_68_LEFT 2 + +MN908947.3 20867 20890 nCoV-2019_68_RIGHT 2 - +MN908947.3 20786 20813 nCoV-2019_69_LEFT 1 + +MN908947.3 21146 21169 nCoV-2019_69_RIGHT 1 - +MN908947.3 21075 21104 nCoV-2019_70_LEFT 2 + +MN908947.3 21427 21455 nCoV-2019_70_RIGHT 2 - +MN908947.3 21357 21386 nCoV-2019_71_LEFT 1 + +MN908947.3 21716 21743 nCoV-2019_71_RIGHT 1 - +MN908947.3 21658 21682 nCoV-2019_72_LEFT 2 + +MN908947.3 22013 22038 nCoV-2019_72_RIGHT 2 - +MN908947.3 21961 21990 nCoV-2019_73_LEFT 1 + +MN908947.3 22324 22346 nCoV-2019_73_RIGHT 1 - +MN908947.3 22262 22290 nCoV-2019_74_LEFT 2 + +MN908947.3 22626 22650 nCoV-2019_74_RIGHT 2 - +MN908947.3 22516 22542 nCoV-2019_75_LEFT 1 + +MN908947.3 22877 22903 nCoV-2019_75_RIGHT 1 - +MN908947.3 22797 22819 nCoV-2019_76_LEFT 2 + +MN908947.3 22798 22821 nCoV-2019_76_LEFT_alt3 2 + +MN908947.3 23192 23214 nCoV-2019_76_RIGHT 2 - +MN908947.3 23189 23212 nCoV-2019_76_RIGHT_alt0 2 - +MN908947.3 23122 23144 nCoV-2019_77_LEFT 1 + +MN908947.3 23500 23522 nCoV-2019_77_RIGHT 1 - +MN908947.3 23443 23466 nCoV-2019_78_LEFT 2 + +MN908947.3 23822 23847 nCoV-2019_78_RIGHT 2 - +MN908947.3 23789 23812 nCoV-2019_79_LEFT 1 + +MN908947.3 24145 24169 nCoV-2019_79_RIGHT 1 - +MN908947.3 24078 24100 nCoV-2019_80_LEFT 2 + +MN908947.3 24443 24467 nCoV-2019_80_RIGHT 2 - 
+MN908947.3 24391 24416 nCoV-2019_81_LEFT 1 + +MN908947.3 24765 24789 nCoV-2019_81_RIGHT 1 - +MN908947.3 24696 24721 nCoV-2019_82_LEFT 2 + +MN908947.3 25052 25076 nCoV-2019_82_RIGHT 2 - +MN908947.3 24978 25003 nCoV-2019_83_LEFT 1 + +MN908947.3 25347 25369 nCoV-2019_83_RIGHT 1 - +MN908947.3 25279 25301 nCoV-2019_84_LEFT 2 + +MN908947.3 25646 25673 nCoV-2019_84_RIGHT 2 - +MN908947.3 25601 25623 nCoV-2019_85_LEFT 1 + +MN908947.3 25969 25994 nCoV-2019_85_RIGHT 1 - +MN908947.3 25902 25924 nCoV-2019_86_LEFT 2 + +MN908947.3 26290 26315 nCoV-2019_86_RIGHT 2 - +MN908947.3 26197 26219 nCoV-2019_87_LEFT 1 + +MN908947.3 26566 26590 nCoV-2019_87_RIGHT 1 - +MN908947.3 26520 26542 nCoV-2019_88_LEFT 2 + +MN908947.3 26890 26913 nCoV-2019_88_RIGHT 2 - +MN908947.3 26835 26857 nCoV-2019_89_LEFT 1 + +MN908947.3 26838 26860 nCoV-2019_89_LEFT_alt2 1 + +MN908947.3 27202 27227 nCoV-2019_89_RIGHT 1 - +MN908947.3 27190 27215 nCoV-2019_89_RIGHT_alt4 1 - +MN908947.3 27141 27164 nCoV-2019_90_LEFT 2 + +MN908947.3 27511 27533 nCoV-2019_90_RIGHT 2 - +MN908947.3 27446 27471 nCoV-2019_91_LEFT 1 + +MN908947.3 27825 27854 nCoV-2019_91_RIGHT 1 - +MN908947.3 27784 27808 nCoV-2019_92_LEFT 2 + +MN908947.3 28145 28172 nCoV-2019_92_RIGHT 2 - +MN908947.3 28081 28104 nCoV-2019_93_LEFT 1 + +MN908947.3 28442 28464 nCoV-2019_93_RIGHT 1 - +MN908947.3 28394 28416 nCoV-2019_94_LEFT 2 + +MN908947.3 28756 28779 nCoV-2019_94_RIGHT 2 - +MN908947.3 28677 28699 nCoV-2019_95_LEFT 1 + +MN908947.3 29041 29063 nCoV-2019_95_RIGHT 1 - +MN908947.3 28985 29007 nCoV-2019_96_LEFT 2 + +MN908947.3 29356 29378 nCoV-2019_96_RIGHT 2 - +MN908947.3 29288 29316 nCoV-2019_97_LEFT 1 + +MN908947.3 29665 29693 nCoV-2019_97_RIGHT 1 - +MN908947.3 29486 29510 nCoV-2019_98_LEFT 2 + +MN908947.3 29836 29866 nCoV-2019_98_RIGHT 2 - diff --git a/config/config.yaml b/config/config.yaml index ae6c6d16e..120a27e48 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -52,8 +52,8 @@ preprocessing: # 
https://github.com/artic-network/artic-ncov2019/tree/master/primer_schemes/nCoV-2019/V4 # for more information artic-primer-version: 3 - # path to amplicon primers in bedpe format for hard-clipping on paired end files (illumina) - amplicon-primers: "resources/primer.bedpe" + # path to amplicon primers in bed format for hard-clipping on paired end files (illumina) or url to file that should be downloaded + amplicon-primers: "resources/nCoV-2019.primer.bed" # GenBank accession of reference sequence of the amplicon primers amplicon-reference: "MN908947" diff --git a/resources/nCoV-2019.primer.bed b/resources/nCoV-2019.primer.bed new file mode 100644 index 000000000..76691de9a --- /dev/null +++ b/resources/nCoV-2019.primer.bed @@ -0,0 +1,218 @@ +MN908947.3 30 54 nCoV-2019_1_LEFT 1 + +MN908947.3 385 410 nCoV-2019_1_RIGHT 1 - +MN908947.3 320 342 nCoV-2019_2_LEFT 2 + +MN908947.3 704 726 nCoV-2019_2_RIGHT 2 - +MN908947.3 642 664 nCoV-2019_3_LEFT 1 + +MN908947.3 1004 1028 nCoV-2019_3_RIGHT 1 - +MN908947.3 943 965 nCoV-2019_4_LEFT 2 + +MN908947.3 1312 1337 nCoV-2019_4_RIGHT 2 - +MN908947.3 1242 1264 nCoV-2019_5_LEFT 1 + +MN908947.3 1623 1651 nCoV-2019_5_RIGHT 1 - +MN908947.3 1573 1595 nCoV-2019_6_LEFT 2 + +MN908947.3 1942 1964 nCoV-2019_6_RIGHT 2 - +MN908947.3 1875 1897 nCoV-2019_7_LEFT 1 + +MN908947.3 1868 1890 nCoV-2019_7_LEFT_alt0 1 + +MN908947.3 2247 2269 nCoV-2019_7_RIGHT 1 - +MN908947.3 2242 2264 nCoV-2019_7_RIGHT_alt5 1 - +MN908947.3 2181 2205 nCoV-2019_8_LEFT 2 + +MN908947.3 2568 2592 nCoV-2019_8_RIGHT 2 - +MN908947.3 2505 2529 nCoV-2019_9_LEFT 1 + +MN908947.3 2504 2528 nCoV-2019_9_LEFT_alt4 1 + +MN908947.3 2882 2904 nCoV-2019_9_RIGHT 1 - +MN908947.3 2880 2902 nCoV-2019_9_RIGHT_alt2 1 - +MN908947.3 2826 2850 nCoV-2019_10_LEFT 2 + +MN908947.3 3183 3210 nCoV-2019_10_RIGHT 2 - +MN908947.3 3144 3166 nCoV-2019_11_LEFT 1 + +MN908947.3 3507 3531 nCoV-2019_11_RIGHT 1 - +MN908947.3 3460 3482 nCoV-2019_12_LEFT 2 + +MN908947.3 3826 3853 nCoV-2019_12_RIGHT 2 - +MN908947.3 3771 
3795 nCoV-2019_13_LEFT 1 + +MN908947.3 4142 4164 nCoV-2019_13_RIGHT 1 - +MN908947.3 4054 4077 nCoV-2019_14_LEFT 2 + +MN908947.3 4044 4068 nCoV-2019_14_LEFT_alt4 2 + +MN908947.3 4428 4450 nCoV-2019_14_RIGHT 2 - +MN908947.3 4402 4424 nCoV-2019_14_RIGHT_alt2 2 - +MN908947.3 4294 4321 nCoV-2019_15_LEFT 1 + +MN908947.3 4296 4322 nCoV-2019_15_LEFT_alt1 1 + +MN908947.3 4674 4696 nCoV-2019_15_RIGHT 1 - +MN908947.3 4666 4689 nCoV-2019_15_RIGHT_alt3 1 - +MN908947.3 4636 4658 nCoV-2019_16_LEFT 2 + +MN908947.3 4995 5017 nCoV-2019_16_RIGHT 2 - +MN908947.3 4939 4966 nCoV-2019_17_LEFT 1 + +MN908947.3 5296 5321 nCoV-2019_17_RIGHT 1 - +MN908947.3 5230 5259 nCoV-2019_18_LEFT 2 + +MN908947.3 5257 5287 nCoV-2019_18_LEFT_alt2 2 + +MN908947.3 5620 5644 nCoV-2019_18_RIGHT 2 - +MN908947.3 5620 5643 nCoV-2019_18_RIGHT_alt1 2 - +MN908947.3 5563 5586 nCoV-2019_19_LEFT 1 + +MN908947.3 5932 5957 nCoV-2019_19_RIGHT 1 - +MN908947.3 5867 5894 nCoV-2019_20_LEFT 2 + +MN908947.3 6247 6272 nCoV-2019_20_RIGHT 2 - +MN908947.3 6167 6196 nCoV-2019_21_LEFT 1 + +MN908947.3 6168 6197 nCoV-2019_21_LEFT_alt2 1 + +MN908947.3 6528 6550 nCoV-2019_21_RIGHT 1 - +MN908947.3 6526 6548 nCoV-2019_21_RIGHT_alt0 1 - +MN908947.3 6466 6495 nCoV-2019_22_LEFT 2 + +MN908947.3 6846 6873 nCoV-2019_22_RIGHT 2 - +MN908947.3 6718 6745 nCoV-2019_23_LEFT 1 + +MN908947.3 7092 7117 nCoV-2019_23_RIGHT 1 - +MN908947.3 7035 7058 nCoV-2019_24_LEFT 2 + +MN908947.3 7389 7415 nCoV-2019_24_RIGHT 2 - +MN908947.3 7305 7332 nCoV-2019_25_LEFT 1 + +MN908947.3 7671 7694 nCoV-2019_25_RIGHT 1 - +MN908947.3 7626 7651 nCoV-2019_26_LEFT 2 + +MN908947.3 7997 8019 nCoV-2019_26_RIGHT 2 - +MN908947.3 7943 7968 nCoV-2019_27_LEFT 1 + +MN908947.3 8319 8341 nCoV-2019_27_RIGHT 1 - +MN908947.3 8249 8275 nCoV-2019_28_LEFT 2 + +MN908947.3 8635 8661 nCoV-2019_28_RIGHT 2 - +MN908947.3 8595 8619 nCoV-2019_29_LEFT 1 + +MN908947.3 8954 8983 nCoV-2019_29_RIGHT 1 - +MN908947.3 8888 8913 nCoV-2019_30_LEFT 2 + +MN908947.3 9245 9271 nCoV-2019_30_RIGHT 2 - +MN908947.3 9204 
9226 nCoV-2019_31_LEFT 1 + +MN908947.3 9557 9585 nCoV-2019_31_RIGHT 1 - +MN908947.3 9477 9502 nCoV-2019_32_LEFT 2 + +MN908947.3 9834 9858 nCoV-2019_32_RIGHT 2 - +MN908947.3 9784 9806 nCoV-2019_33_LEFT 1 + +MN908947.3 10146 10171 nCoV-2019_33_RIGHT 1 - +MN908947.3 10076 10099 nCoV-2019_34_LEFT 2 + +MN908947.3 10437 10459 nCoV-2019_34_RIGHT 2 - +MN908947.3 10362 10384 nCoV-2019_35_LEFT 1 + +MN908947.3 10737 10763 nCoV-2019_35_RIGHT 1 - +MN908947.3 10666 10688 nCoV-2019_36_LEFT 2 + +MN908947.3 11048 11074 nCoV-2019_36_RIGHT 2 - +MN908947.3 10999 11022 nCoV-2019_37_LEFT 1 + +MN908947.3 11372 11394 nCoV-2019_37_RIGHT 1 - +MN908947.3 11306 11331 nCoV-2019_38_LEFT 2 + +MN908947.3 11668 11693 nCoV-2019_38_RIGHT 2 - +MN908947.3 11555 11584 nCoV-2019_39_LEFT 1 + +MN908947.3 11927 11949 nCoV-2019_39_RIGHT 1 - +MN908947.3 11863 11889 nCoV-2019_40_LEFT 2 + +MN908947.3 12234 12256 nCoV-2019_40_RIGHT 2 - +MN908947.3 12110 12133 nCoV-2019_41_LEFT 1 + +MN908947.3 12465 12490 nCoV-2019_41_RIGHT 1 - +MN908947.3 12417 12439 nCoV-2019_42_LEFT 2 + +MN908947.3 12779 12802 nCoV-2019_42_RIGHT 2 - +MN908947.3 12710 12732 nCoV-2019_43_LEFT 1 + +MN908947.3 13074 13096 nCoV-2019_43_RIGHT 1 - +MN908947.3 13005 13027 nCoV-2019_44_LEFT 2 + +MN908947.3 13007 13029 nCoV-2019_44_LEFT_alt3 2 + +MN908947.3 13378 13400 nCoV-2019_44_RIGHT 2 - +MN908947.3 13363 13385 nCoV-2019_44_RIGHT_alt0 2 - +MN908947.3 13319 13344 nCoV-2019_45_LEFT 1 + +MN908947.3 13307 13336 nCoV-2019_45_LEFT_alt2 1 + +MN908947.3 13669 13699 nCoV-2019_45_RIGHT 1 - +MN908947.3 13660 13689 nCoV-2019_45_RIGHT_alt7 1 - +MN908947.3 13599 13621 nCoV-2019_46_LEFT 2 + +MN908947.3 13602 13625 nCoV-2019_46_LEFT_alt1 2 + +MN908947.3 13962 13984 nCoV-2019_46_RIGHT 2 - +MN908947.3 13961 13984 nCoV-2019_46_RIGHT_alt2 2 - +MN908947.3 13918 13946 nCoV-2019_47_LEFT 1 + +MN908947.3 14271 14299 nCoV-2019_47_RIGHT 1 - +MN908947.3 14207 14232 nCoV-2019_48_LEFT 2 + +MN908947.3 14579 14601 nCoV-2019_48_RIGHT 2 - +MN908947.3 14545 14570 nCoV-2019_49_LEFT 1 
+ +MN908947.3 14898 14926 nCoV-2019_49_RIGHT 1 - +MN908947.3 14865 14895 nCoV-2019_50_LEFT 2 + +MN908947.3 15224 15246 nCoV-2019_50_RIGHT 2 - +MN908947.3 15171 15193 nCoV-2019_51_LEFT 1 + +MN908947.3 15538 15560 nCoV-2019_51_RIGHT 1 - +MN908947.3 15481 15503 nCoV-2019_52_LEFT 2 + +MN908947.3 15861 15886 nCoV-2019_52_RIGHT 2 - +MN908947.3 15827 15851 nCoV-2019_53_LEFT 1 + +MN908947.3 16186 16209 nCoV-2019_53_RIGHT 1 - +MN908947.3 16118 16144 nCoV-2019_54_LEFT 2 + +MN908947.3 16485 16510 nCoV-2019_54_RIGHT 2 - +MN908947.3 16416 16444 nCoV-2019_55_LEFT 1 + +MN908947.3 16804 16833 nCoV-2019_55_RIGHT 1 - +MN908947.3 16748 16770 nCoV-2019_56_LEFT 2 + +MN908947.3 17130 17152 nCoV-2019_56_RIGHT 2 - +MN908947.3 17065 17087 nCoV-2019_57_LEFT 1 + +MN908947.3 17430 17452 nCoV-2019_57_RIGHT 1 - +MN908947.3 17381 17406 nCoV-2019_58_LEFT 2 + +MN908947.3 17738 17761 nCoV-2019_58_RIGHT 2 - +MN908947.3 17674 17697 nCoV-2019_59_LEFT 1 + +MN908947.3 18036 18062 nCoV-2019_59_RIGHT 1 - +MN908947.3 17966 17993 nCoV-2019_60_LEFT 2 + +MN908947.3 18324 18348 nCoV-2019_60_RIGHT 2 - +MN908947.3 18253 18275 nCoV-2019_61_LEFT 1 + +MN908947.3 18650 18672 nCoV-2019_61_RIGHT 1 - +MN908947.3 18596 18618 nCoV-2019_62_LEFT 2 + +MN908947.3 18957 18979 nCoV-2019_62_RIGHT 2 - +MN908947.3 18896 18918 nCoV-2019_63_LEFT 1 + +MN908947.3 19275 19297 nCoV-2019_63_RIGHT 1 - +MN908947.3 19204 19232 nCoV-2019_64_LEFT 2 + +MN908947.3 19591 19616 nCoV-2019_64_RIGHT 2 - +MN908947.3 19548 19570 nCoV-2019_65_LEFT 1 + +MN908947.3 19911 19939 nCoV-2019_65_RIGHT 1 - +MN908947.3 19844 19866 nCoV-2019_66_LEFT 2 + +MN908947.3 20231 20255 nCoV-2019_66_RIGHT 2 - +MN908947.3 20172 20200 nCoV-2019_67_LEFT 1 + +MN908947.3 20542 20572 nCoV-2019_67_RIGHT 1 - +MN908947.3 20472 20496 nCoV-2019_68_LEFT 2 + +MN908947.3 20867 20890 nCoV-2019_68_RIGHT 2 - +MN908947.3 20786 20813 nCoV-2019_69_LEFT 1 + +MN908947.3 21146 21169 nCoV-2019_69_RIGHT 1 - +MN908947.3 21075 21104 nCoV-2019_70_LEFT 2 + +MN908947.3 21427 21455 nCoV-2019_70_RIGHT 2 
- +MN908947.3 21357 21386 nCoV-2019_71_LEFT 1 + +MN908947.3 21716 21743 nCoV-2019_71_RIGHT 1 - +MN908947.3 21658 21682 nCoV-2019_72_LEFT 2 + +MN908947.3 22013 22038 nCoV-2019_72_RIGHT 2 - +MN908947.3 21961 21990 nCoV-2019_73_LEFT 1 + +MN908947.3 22324 22346 nCoV-2019_73_RIGHT 1 - +MN908947.3 22262 22290 nCoV-2019_74_LEFT 2 + +MN908947.3 22626 22650 nCoV-2019_74_RIGHT 2 - +MN908947.3 22516 22542 nCoV-2019_75_LEFT 1 + +MN908947.3 22877 22903 nCoV-2019_75_RIGHT 1 - +MN908947.3 22797 22819 nCoV-2019_76_LEFT 2 + +MN908947.3 22798 22821 nCoV-2019_76_LEFT_alt3 2 + +MN908947.3 23192 23214 nCoV-2019_76_RIGHT 2 - +MN908947.3 23189 23212 nCoV-2019_76_RIGHT_alt0 2 - +MN908947.3 23122 23144 nCoV-2019_77_LEFT 1 + +MN908947.3 23500 23522 nCoV-2019_77_RIGHT 1 - +MN908947.3 23443 23466 nCoV-2019_78_LEFT 2 + +MN908947.3 23822 23847 nCoV-2019_78_RIGHT 2 - +MN908947.3 23789 23812 nCoV-2019_79_LEFT 1 + +MN908947.3 24145 24169 nCoV-2019_79_RIGHT 1 - +MN908947.3 24078 24100 nCoV-2019_80_LEFT 2 + +MN908947.3 24443 24467 nCoV-2019_80_RIGHT 2 - +MN908947.3 24391 24416 nCoV-2019_81_LEFT 1 + +MN908947.3 24765 24789 nCoV-2019_81_RIGHT 1 - +MN908947.3 24696 24721 nCoV-2019_82_LEFT 2 + +MN908947.3 25052 25076 nCoV-2019_82_RIGHT 2 - +MN908947.3 24978 25003 nCoV-2019_83_LEFT 1 + +MN908947.3 25347 25369 nCoV-2019_83_RIGHT 1 - +MN908947.3 25279 25301 nCoV-2019_84_LEFT 2 + +MN908947.3 25646 25673 nCoV-2019_84_RIGHT 2 - +MN908947.3 25601 25623 nCoV-2019_85_LEFT 1 + +MN908947.3 25969 25994 nCoV-2019_85_RIGHT 1 - +MN908947.3 25902 25924 nCoV-2019_86_LEFT 2 + +MN908947.3 26290 26315 nCoV-2019_86_RIGHT 2 - +MN908947.3 26197 26219 nCoV-2019_87_LEFT 1 + +MN908947.3 26566 26590 nCoV-2019_87_RIGHT 1 - +MN908947.3 26520 26542 nCoV-2019_88_LEFT 2 + +MN908947.3 26890 26913 nCoV-2019_88_RIGHT 2 - +MN908947.3 26835 26857 nCoV-2019_89_LEFT 1 + +MN908947.3 26838 26860 nCoV-2019_89_LEFT_alt2 1 + +MN908947.3 27202 27227 nCoV-2019_89_RIGHT 1 - +MN908947.3 27190 27215 nCoV-2019_89_RIGHT_alt4 1 - +MN908947.3 27141 27164 
def check_bed_for_URL(bed_file):
    """Return a local path for the configured primer BED file.

    If ``bed_file`` is an ``http``/``https`` URL, the file is downloaded to
    ``resources/<basename of the URL path>`` and that relative path is
    returned. Any other value is treated as a local path and returned
    unchanged.

    Args:
        bed_file: Local path or URL, as configured under
            ``preprocessing: amplicon-primers``.

    Returns:
        Path to a local BED file (relative to the working directory when the
        file was downloaded).

    Raises:
        ValueError: If the URL path does not end in a file name.
        urllib.error.URLError: If the download fails.
    """
    # Function-scope imports keep this helper self-contained.
    import os
    import urllib.parse
    import urllib.request

    parsed = urllib.parse.urlparse(bed_file)
    # Decide by URL scheme rather than by substring: a local path such as
    # "data/https_mirror/primer.bed" must not be mistaken for a URL, while
    # plain "http://" links should be accepted.
    if parsed.scheme not in ("http", "https"):
        return bed_file

    # Take the name from the URL *path* so that query strings and fragments
    # ("?raw=true", "#L1") do not end up in the local file name.
    filename = os.path.basename(parsed.path)
    if not filename:
        raise ValueError(
            "Cannot derive a file name from primer BED URL: {}".format(bed_file)
        )

    filepath = "resources/{}".format(filename)
    # This function is evaluated while the workflow is being parsed, i.e. on
    # every invocation. Re-downloading each time would refresh the file's
    # modification time and require network access for every run, so an
    # existing copy is reused. Delete the file to force a fresh download.
    if not os.path.exists(filepath):
        os.makedirs("resources", exist_ok=True)
        # Download to a temporary name first so that an interrupted transfer
        # never leaves a truncated file at the final location.
        partial = filepath + ".part"
        urllib.request.urlretrieve(bed_file, partial)
        os.replace(partial, filepath)
    return filepath
"""Convert a primer BED file into the BEDPE file used for primer clipping.

Forward ("+") and reverse ("-") primers are paired by their order of
appearance in the BED file: the n-th "+" record is combined with the n-th
"-" record into one BEDPE line ``chrom1 start1 end1 chrom2 start2 end2``.
"""
import pandas as pd

BED_COLUMNS = ["chrom", "start", "end", "name", "score", "strand"]


def _read_bed(bed_path):
    """Read the first six BED columns of ``bed_path`` into a DataFrame of strings."""
    records = []
    with open(bed_path) as bed_file:
        for line_number, line in enumerate(bed_file, start=1):
            fields = line.split()
            # Skip blank lines as well as comment/track/browser header lines.
            if (
                not fields
                or fields[0].startswith("#")
                or fields[0] in ("track", "browser")
            ):
                continue
            if len(fields) < len(BED_COLUMNS):
                raise ValueError(
                    "{}:{}: expected at least {} columns, found {}".format(
                        bed_path, line_number, len(BED_COLUMNS), len(fields)
                    )
                )
            # Extra columns (e.g. a primer sequence) are not needed here.
            records.append(fields[: len(BED_COLUMNS)])
    return pd.DataFrame(records, columns=BED_COLUMNS)


def bed_to_bedpe(bed_path, bedpe_path):
    """Write the primer pairs of ``bed_path`` as headerless, tab-separated BEDPE.

    Args:
        bed_path: Input BED file with at least six whitespace-separated columns.
        bedpe_path: Output file; an existing file is overwritten.

    Raises:
        ValueError: If a record has fewer than six columns or if the number
            of "+" and "-" primers differs (positional pairing would then
            combine unrelated primers).
    """
    df_bed = _read_bed(bed_path)
    # Both strands get a fresh 0..n-1 index so that they line up row by row.
    # reset_index(drop=True) returns new frames instead of modifying slices
    # of df_bed in place.
    df_sense = df_bed.loc[df_bed["strand"] == "+"].reset_index(drop=True)
    df_antisense = df_bed.loc[df_bed["strand"] == "-"].reset_index(drop=True)

    if len(df_sense) != len(df_antisense):
        raise ValueError(
            "{}: {} forward (+) but {} reverse (-) primers; cannot pair them "
            "by position".format(bed_path, len(df_sense), len(df_antisense))
        )

    df_bedpe = pd.DataFrame(
        {
            "chrom1": df_sense["chrom"],
            "start1": df_sense["start"],
            "end1": df_sense["end"],
            "chrom2": df_antisense["chrom"],
            "start2": df_antisense["start"],
            "end2": df_antisense["end"],
        }
    )
    # Default write mode: replace the output instead of appending to a file
    # that may be left over from an earlier run.
    df_bedpe.to_csv(bedpe_path, header=False, index=False, sep="\t")


# ``snakemake`` is injected into the script's globals by Snakemake's
# ``script:`` directive; the guard keeps the module importable without it.
if "snakemake" in globals():
    bed_to_bedpe(snakemake.input[0], snakemake.output[0])