Speed optimization: add an outer per-chromosome (seqname) loop so each per-gene lookup filters only that chromosome's rows

kopardev · Sep 29, 2021 · 2cbe9cd · 2cbe9cd
1 parent 65e25f1
commit 2cbe9cd
Showing 1 changed file with 29 additions and 25 deletions.
diff --git a/util/make_custom_species_files.py b/util/make_custom_species_files.py
@@ -16,38 +16,42 @@
 gtf_outfile="%s.AS.STRUCTURE.COMPILED.gff"%(args.species)
 
 df = gtfparse.read_gtf(args.gtf)
+seqnames = df.seqname.unique()
 bigdict = dict()
 if debug==1: print("list of genes=",list(df.gene_id.unique()))
 
 gene2loc=dict()
 genedata=[]
 exondata=[]
 
-for gene in list(df.gene_id.unique()):
-    y=df[(df["gene_id"]==gene) & (df["feature"]=="gene")]
-    gene_chrom=y['seqname'].iloc[0]
-    gene_start=str(y['start'].iloc[0])
-    gene_end=str(y['end'].iloc[0])
-    gene_strand=y['strand'].iloc[0]
-    gene2loc[gene]="##".join([gene_chrom,gene_start,gene_end,gene_strand])
-    gene_start=str(int(gene_start)-1)
-    genedata.append([gene_chrom,gene_start,gene_end,gene,"0",gene_strand])
-    bigdict[gene]=list()
-    x=df[(df["gene_id"]==gene) & (df["feature"]!="gene")]
-    if debug==1: print("gene=",gene)
-    if debug==1: print("list of transcripts=",list(x.transcript_id.unique()))
-    for transcript in list(x.transcript_id.unique()):
-        y=x[(x["transcript_id"]==transcript) & (x["feature"]=="exon")]
-        l=0
-        for i,exonrow in y.iterrows():
-            exon_start=str(exonrow["start"]-1)
-            exon_end=str(exonrow["end"])
-            exondata.append([gene_chrom,exon_start,exon_end,gene,"0",gene_strand])
-            l+=(exonrow["end"]-(exonrow["start"]-1))
-        if debug==1: print("transcript=",transcript)
-        if debug==1: print("l=",l)
-        bigdict[gene].append(l)
-    if debug==1: print(bigdict)
+for seq in seqnames:
+    print(seq)
+    df_seq=df[df['seqname']==seq]
+    for gene in list(df_seq.gene_id.unique()):
+        y=df_seq[(df_seq["gene_id"]==gene) & (df_seq["feature"]=="gene")]
+        gene_chrom=y['seqname'].iloc[0]
+        gene_start=str(y['start'].iloc[0])
+        gene_end=str(y['end'].iloc[0])
+        gene_strand=y['strand'].iloc[0]
+        gene2loc[gene]="##".join([gene_chrom,gene_start,gene_end,gene_strand])
+        gene_start=str(int(gene_start)-1)
+        genedata.append([gene_chrom,gene_start,gene_end,gene,"0",gene_strand])
+        bigdict[gene]=list()
+        x=df_seq[(df_seq["gene_id"]==gene) & (df_seq["feature"]!="gene")]
+        if debug==1: print("gene=",gene)
+        if debug==1: print("list of transcripts=",list(x.transcript_id.unique()))
+        for transcript in list(x.transcript_id.unique()):
+            y=x[(x["transcript_id"]==transcript) & (x["feature"]=="exon")]
+            l=0
+            for i,exonrow in y.iterrows():
+                exon_start=str(exonrow["start"]-1)
+                exon_end=str(exonrow["end"])
+                exondata.append([gene_chrom,exon_start,exon_end,gene,"0",gene_strand])
+                l+=(exonrow["end"]-(exonrow["start"]-1))
+            if debug==1: print("transcript=",transcript)
+            if debug==1: print("l=",l)
+            bigdict[gene].append(l)
+        if debug==1: print(bigdict)
 
 genedf=pandas.DataFrame(genedata,columns=['chrom','start','end','geneid','score','strand'])
 genedf=genedf.astype({'start':int,'end':int})