Skip to content

Commit

Permalink
Speed optimization: add a per-chromosome for loop
Browse files Browse the repository at this point in the history
  • Loading branch information
kopardev committed Sep 29, 2021
1 parent 65e25f1 commit 2cbe9cd
Showing 1 changed file with 29 additions and 25 deletions.
54 changes: 29 additions & 25 deletions util/make_custom_species_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,38 +16,42 @@
gtf_outfile="%s.AS.STRUCTURE.COMPILED.gff"%(args.species)

df = gtfparse.read_gtf(args.gtf)
seqnames = df.seqname.unique()
bigdict = dict()
if debug==1: print("list of genes=",list(df.gene_id.unique()))

gene2loc=dict()
genedata=[]
exondata=[]

# NOTE(review): indentation is not preserved in this view, so the nesting
# described in these comments is inferred from the statements - confirm against the file.
# Visit every gene id in the full table, filling gene2loc, genedata, exondata and bigdict.
for gene in list(df.gene_id.unique()):
# Rows for this gene id whose feature is "gene"; the first one supplies the coordinates.
# Assumes at least one such row exists - .iloc[0] fails on an empty selection.
y=df[(df["gene_id"]==gene) & (df["feature"]=="gene")]
gene_chrom=y['seqname'].iloc[0]
gene_start=str(y['start'].iloc[0])
gene_end=str(y['end'].iloc[0])
gene_strand=y['strand'].iloc[0]
# The location string keeps the start exactly as read from the GTF.
gene2loc[gene]="##".join([gene_chrom,gene_start,gene_end,gene_strand])
# The genedata row stores start-1 instead (presumably a 0-based, BED-style start - confirm).
gene_start=str(int(gene_start)-1)
genedata.append([gene_chrom,gene_start,gene_end,gene,"0",gene_strand])
# Start an empty list of transcript lengths for this gene.
bigdict[gene]=list()
# Every row of this gene id whose feature is not "gene".
x=df[(df["gene_id"]==gene) & (df["feature"]!="gene")]
if debug==1: print("gene=",gene)
if debug==1: print("list of transcripts=",list(x.transcript_id.unique()))
for transcript in list(x.transcript_id.unique()):
# Exon rows belonging to this transcript only.
y=x[(x["transcript_id"]==transcript) & (x["feature"]=="exon")]
# l accumulates the summed exon length of the current transcript.
l=0
for i,exonrow in y.iterrows():
# Exon rows use the same start-1 shift as the gene rows and reuse the gene's seqname and strand.
exon_start=str(exonrow["start"]-1)
exon_end=str(exonrow["end"])
exondata.append([gene_chrom,exon_start,exon_end,gene,"0",gene_strand])
# Adds end-start+1 for this exon.
l+=(exonrow["end"]-(exonrow["start"]-1))
if debug==1: print("transcript=",transcript)
if debug==1: print("l=",l)
# Record the transcript's total exon length under its gene.
bigdict[gene].append(l)
if debug==1: print(bigdict)
# NOTE(review): indentation is not preserved in this view, so the nesting
# described in these comments is inferred from the statements - confirm against the file.
# Process the annotation one seqname at a time so each per-gene selection
# filters the smaller per-seqname table (df_seq) rather than the full table (df).
for seq in seqnames:
# NOTE(review): printed unconditionally, unlike the other prints which are gated on debug==1.
print(seq)
# Rows of the annotation that lie on the current seqname.
df_seq=df[df['seqname']==seq]
# NOTE(review): a gene_id present on more than one seqname is visited once per
# seqname; each later visit overwrites gene2loc[gene] and resets bigdict[gene].
for gene in list(df_seq.gene_id.unique()):
# Rows for this gene id whose feature is "gene"; the first one supplies the coordinates.
# Assumes at least one such row exists on this seqname - .iloc[0] fails on an empty selection.
y=df_seq[(df_seq["gene_id"]==gene) & (df_seq["feature"]=="gene")]
gene_chrom=y['seqname'].iloc[0]
gene_start=str(y['start'].iloc[0])
gene_end=str(y['end'].iloc[0])
gene_strand=y['strand'].iloc[0]
# The location string keeps the start exactly as read from the GTF.
gene2loc[gene]="##".join([gene_chrom,gene_start,gene_end,gene_strand])
# The genedata row stores start-1 instead (presumably a 0-based, BED-style start - confirm).
gene_start=str(int(gene_start)-1)
genedata.append([gene_chrom,gene_start,gene_end,gene,"0",gene_strand])
# Start an empty list of transcript lengths for this gene.
bigdict[gene]=list()
# Every row of this gene id on this seqname whose feature is not "gene".
x=df_seq[(df_seq["gene_id"]==gene) & (df_seq["feature"]!="gene")]
if debug==1: print("gene=",gene)
if debug==1: print("list of transcripts=",list(x.transcript_id.unique()))
for transcript in list(x.transcript_id.unique()):
# Exon rows belonging to this transcript only.
y=x[(x["transcript_id"]==transcript) & (x["feature"]=="exon")]
# l accumulates the summed exon length of the current transcript.
l=0
for i,exonrow in y.iterrows():
# Exon rows use the same start-1 shift as the gene rows and reuse the gene's seqname and strand.
exon_start=str(exonrow["start"]-1)
exon_end=str(exonrow["end"])
exondata.append([gene_chrom,exon_start,exon_end,gene,"0",gene_strand])
# Adds end-start+1 for this exon.
l+=(exonrow["end"]-(exonrow["start"]-1))
if debug==1: print("transcript=",transcript)
if debug==1: print("l=",l)
# Record the transcript's total exon length under its gene.
bigdict[gene].append(l)
if debug==1: print(bigdict)

# Turn the collected per-gene rows into a table with named columns.
genedf=pandas.DataFrame(genedata,columns=['chrom','start','end','geneid','score','strand'])
# start/end were appended as strings; convert them to integers.
genedf=genedf.astype({'start':int,'end':int})
Expand Down

0 comments on commit 2cbe9cd

Please sign in to comment.