Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add proksee and gff output #139

Merged
merged 36 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
01619b3
add contig length in contig object
JeanMainguy Aug 10, 2023
8dfb1ac
write and read contig length
JeanMainguy Aug 11, 2023
6d1b6fc
add gff output
JeanMainguy Aug 11, 2023
fc5bca2
fix module and Spot loading
JeanMainguy Aug 11, 2023
95497b7
add source information
JeanMainguy Aug 30, 2023
b87a2ee
Merge remote-tracking branch 'origin/AnnotHDF5Reformat' into gff_output
JeanMainguy Sep 19, 2023
d1f2231
add gene line for each CDS ARN
JeanMainguy Sep 21, 2023
67867c0
Merge remote-tracking branch 'origin/dev' into gff_output
JeanMainguy Sep 21, 2023
1dab861
adapt proksee panorama code to ppanggolin
JeanMainguy Sep 25, 2023
bffb2b8
improve proksee output
JeanMainguy Sep 25, 2023
7bcc7a4
add possibility to add sequences
JeanMainguy Oct 2, 2023
c1ed656
Merge remote-tracking branch 'origin/context' into gff_output
JeanMainguy Oct 4, 2023
77c3be8
fix wrong completion calculation in module outputs
JeanMainguy Oct 12, 2023
0b6f6f4
Merge branch 'dev' into gff_output
JeanMainguy Oct 17, 2023
32bd30a
remove the need of a proksee template
JeanMainguy Oct 17, 2023
a4795fe
color modules with different colors according to their proximity
JeanMainguy Oct 18, 2023
53dac90
clean proksee generation code and add type and docstring
JeanMainguy Oct 18, 2023
1e15c31
reuse projection fct to parse genome paths file
JeanMainguy Oct 18, 2023
6f95236
add proksee output to projection
JeanMainguy Oct 19, 2023
ae013a5
add sequence to gff file when required
JeanMainguy Oct 19, 2023
7a01e27
add gff output in projection cmd
JeanMainguy Oct 19, 2023
fb4fc13
add argument to output or not gff and proksee
JeanMainguy Oct 19, 2023
8a13856
add proksee own output dir in write cmd
JeanMainguy Oct 19, 2023
334ad70
Add limit in networkx version for compatibility problem with python3.8
jpjarnoux Oct 20, 2023
0c590c6
fix error in parsing fasta file
JeanMainguy Oct 20, 2023
a0f6132
add seq to proksee and gff output for proj cmd
JeanMainguy Oct 20, 2023
8e447c8
fix bug in proksee output
JeanMainguy Oct 20, 2023
a1fb03e
add gff and proksee in the gh action
JeanMainguy Oct 20, 2023
068bbba
fix error in gh action
JeanMainguy Oct 20, 2023
4defeca
resolve some review comment
jpjarnoux Oct 23, 2023
95171ee
Fix variable name
jpjarnoux Oct 23, 2023
5eb755a
Fix bug in get contig length in gff files
jpjarnoux Oct 23, 2023
accd2c1
resolve PR review
jpjarnoux Oct 23, 2023
61dacfe
refac launch of projection
JeanMainguy Oct 23, 2023
b0b7ef1
Refactoring
jpjarnoux Oct 24, 2023
d58a253
Merge branch 'gff_output' of https://github.com/labgem/PPanGGOLiN int…
jpjarnoux Oct 24, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ jobs:
ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1
ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1 --fasta organisms.fasta.list --gff --proksee
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log
Expand Down Expand Up @@ -134,10 +134,10 @@ jobs:
run: |
cd testingDataset
head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list
ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_lisy_of_gbff --anno organisms.gbff.head.list
ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno organisms.gbff.head.list --gff --proksee


ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \
--organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \
--spot_graph --graph_formats graphml --fast --keep_tmp -f
--spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee

6 changes: 6 additions & 0 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,9 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
while not line.startswith('//'):
sequence += line[10:].replace(" ", "").strip().upper()
line = lines.pop()

contig.add_contig_length(len(sequence))
jpjarnoux marked this conversation as resolved.
Show resolved Hide resolved

# get each gene's sequence.
for gene in contig.genes:
gene.add_sequence(get_dna_sequence(sequence, gene))
Expand Down Expand Up @@ -368,6 +371,9 @@ def get_id_attribute(attributes_dict: dict) -> str:
if has_fasta and fasta_string != "":
contig_sequences = read_fasta(org, fasta_string.split('\n')) # _ is total contig length
for contig in org.contigs:

contig.add_contig_length(len(contig_sequences[contig.name]))

for gene in contig.genes:
gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
for rna in contig.RNAs:
Expand Down
61 changes: 2 additions & 59 deletions ppanggolin/context/searchGeneContext.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import logging
import os
from typing import List, Dict, Tuple, Iterable, Hashable, Iterator, Set
from itertools import zip_longest, chain
from itertools import chain
from collections import defaultdict
from pathlib import Path

Expand All @@ -21,7 +21,7 @@
# local libraries
from ppanggolin.formats import check_pangenome_info
from ppanggolin.genome import Gene, Contig
from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not
from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not, extract_contig_window
from ppanggolin.pangenome import Pangenome
from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, \
get_input_seq_to_family_with_all, get_seq_ids
Expand Down Expand Up @@ -445,63 +445,6 @@ def get_n_next_genes_index(current_index: int, next_genes_count: int,
yield next_gene_index


def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int,
                          is_circular: bool = False) -> List[Tuple[int, int]]:
    """
    Extracts contiguous windows around positions of interest within a contig.

    Each position of interest is expanded by ``window_size`` on both sides; windows that
    overlap or are directly adjacent are merged into a single window. Returned coordinates
    are inclusive and always clipped to ``[0, contig_size - 1]``.

    :param contig_size: Number of genes in contig.
    :param positions_of_interest: An iterable containing the positions of interest.
    :param window_size: The size of the window to extract around each position of interest.
    :param is_circular: Indicates if the contig is circular.
    :return: List of (start, end) tuples, one per contiguous window, in increasing order.
             An empty list is returned when there is no position of interest.
    :raises IndexError: If a position of interest is negative or >= ``contig_size``.
    """
    # Sort the positions of interest (also materialises any one-shot iterable)
    sorted_positions = sorted(positions_of_interest)

    # No position of interest means no window: return early instead of failing
    # on the indexing below with an unrelated "list index out of range" error.
    if not sorted_positions:
        return []

    # Check if any position of interest is out of range
    if sorted_positions[0] < 0 or sorted_positions[-1] >= contig_size:
        raise IndexError(f'Positions of interest are out of range. '
                         f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions")

    if is_circular:
        first_position = sorted_positions[0]
        last_position = sorted_positions[-1]
        # In a circular contig, a window may wrap around the end/start of the contig.
        # To account for that, a virtual out-of-scope position is added on the opposite
        # side. Every window is clipped to the contig range afterwards, so no
        # out-of-scope coordinate can reach the returned list.
        if first_position - window_size < 0:
            # Window of the first position spills before 0: mirror it past the end.
            sorted_positions.append(contig_size + first_position)

        if last_position + window_size >= contig_size:
            # Window of the last position spills past the end: mirror it before 0.
            sorted_positions.insert(0, last_position - contig_size)

    windows_coordinates = []
    last_index = contig_size - 1
    start_po = max(sorted_positions[0] - window_size, 0)

    for position, next_po in zip(sorted_positions, sorted_positions[1:]):
        if position + window_size + 1 < next_po - window_size:
            # There is a gap between the two windows: close the current one
            # and start a new window around the next position.
            windows_coordinates.append((start_po, min(position + window_size, last_index)))
            start_po = max(next_po - window_size, 0)

    # Close the window that is still open around the last position.
    windows_coordinates.append((start_po, min(sorted_positions[-1] + window_size, last_index)))

    return windows_coordinates


def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set[Gene]]:
"""
Group genes from specified gene families by contig.
Expand Down
Loading
Loading