bump to 1.2.2

lmdu · Jul 13, 2020 · c53c3b2 · c53c3b2
1 parent 098c1c5
commit c53c3b2
Show file tree

Hide file tree

Showing 11 changed files with 5,628 additions and 5,054 deletions.
diff --git a/setup/win.iss b/setup/win.iss
@@ -2,7 +2,7 @@
 ; SEE THE DOCUMENTATION FOR DETAILS ON CREATING INNO SETUP SCRIPT FILES!
 
 #define MyAppName "Krait"
-#define MyAppVersion "1.2.1"
+#define MyAppVersion "1.2.2"
 #define MyAppPublisher "Lianming Du"
 #define MyAppURL "https://github.com/lmdu/krait"
 #define MyAppExeName "Krait.exe"

diff --git a/src/config.py b/src/config.py
@@ -2,9 +2,9 @@
 import os
 import appdirs
 
-VERSION = "1.2.1"
+VERSION = "1.2.2"
 
-BUILD = '20200521'
+BUILD = '20200713'
 
 ROOT_PATH = os.path.abspath(os.path.dirname(__file__))
 

diff --git a/src/gff.py b/src/gff.py
@@ -56,26 +56,28 @@ def parse(self):
 			if line[0] == '#': continue
 
 			cols = line.strip().split('\t')
-			
+
 			record = Data(
 				seqid = cols[0],
 				feature = cols[2].upper(),
 				start = int(cols[3]),
 				end = int(cols[4]),
 				attrs = Data()
 			)
-			
+
 			for item in cols[-1].split(';'):
 				if not item:
 					continue
-				
+
 				#if _format == 'GFF':
 				#	name, value = item.split('=')
 				#else:
 				#	name, value = item.strip().strip('"').split('"')
+				try:
+					name, value = self.split_val(item)
+				except ValueError:
+					continue
 
-				name, value = self.split_val(item)
-
 				record.attrs[name.strip().upper()] = value
 
 			yield record
@@ -104,10 +106,10 @@ def create_interval_tree(self):
 
 			if feature[0] != prev_chrom:
 				if starts:
-					starts = numpy.array(starts, dtype=numpy.int32)
-					ends = numpy.array(ends, dtype=numpy.int32)
-					indexes = numpy.array(indexes, dtype=numpy.int32)
-					self.interval_forest[prev_chrom] = ncls.NCLS32(starts, ends, indexes)
+					starts = numpy.array(starts, dtype=numpy.int64)
+					ends = numpy.array(ends, dtype=numpy.int64)
+					indexes = numpy.array(indexes, dtype=numpy.int64)
+					self.interval_forest[prev_chrom] = ncls.NCLS64(starts, ends, indexes)
 
 				prev_chrom = feature[0]
 				starts = []
@@ -119,10 +121,10 @@ def create_interval_tree(self):
 			indexes.append(feat_id)
 
 		if starts:
-			starts = numpy.array(starts, dtype=numpy.int32)
-			ends = numpy.array(ends, dtype=numpy.int32)
-			indexes = numpy.array(indexes, dtype=numpy.int32)
-			self.interval_forest[prev_chrom] = ncls.NCLS32(starts, ends, indexes)
+			starts = numpy.array(starts, dtype=numpy.int64)
+			ends = numpy.array(ends, dtype=numpy.int64)
+			indexes = numpy.array(indexes, dtype=numpy.int64)
+			self.interval_forest[prev_chrom] = ncls.NCLS64(starts, ends, indexes)
 
 	def mapping(self, chrom, start, end):
 		if chrom not in self.interval_forest:
@@ -138,7 +140,7 @@ def mapping(self, chrom, start, end):
 		for candidate in ['CDS', 'exon', 'UTR', 'intron']:
 			for feat, gid in feats:
 				if candidate in feat:
-					return (self.featid_mapping[feat], self.gene_mapping[gid])
+					return (self.featid_mapping[feat], self.gene_mapping[gid]) 
 
 		return None
 
@@ -192,12 +194,29 @@ def get_gene_mapping(self):
 			self.gene_info.append((gene_num, row.seqid, row.start, row.end, gene_id, gene_name, biotype))
 
 	def get_features(self):
+		chrom = None
 		father = None
 		exons = []
 
 		parents = {}
 
 		for r in self.parse():
+			if r.seqid != chrom:
+				if exons:
+					exons = sorted(exons, key=lambda x: x[2])
+
+					for idx, exon in enumerate(exons):
+						yield exon
+
+						if idx < len(exons)-1:
+							start = exon[2] + 1
+							end = exons[idx+1][1] - 1
+							yield (exons[0][0], start, end, 'intron', exons[0][4])
+
+				chrom = r.seqid
+				exons = []
+				father = None
+
 			if r.feature == 'REGION':
 				continue
 
@@ -258,15 +277,15 @@ def get_features(self):
 					except:
 						parents[r.attrs.ID] = r.attrs.ID
 
-		exons = sorted(exons, key=lambda x: x[2])
-
-		for idx, exon in enumerate(exons):
-			yield exon
+		if exons:
+			exons = sorted(exons, key=lambda x: x[2])
+			for idx, exon in enumerate(exons):
+				yield exon
 
-			if idx < len(exons)-1:
-				start = exon[2] + 1
-				end = exons[idx+1][1] - 1
-				yield (exons[0][0], start, end, 'intron', exons[0][4])
+				if idx < len(exons)-1:
+					start = exon[2] + 1
+					end = exons[idx+1][1] - 1
+					yield (exons[0][0], start, end, 'intron', exons[0][4])
 
 class GTFParser(AnnotParser):
 	def split_val(self, item):
@@ -288,12 +307,28 @@ def get_gene_mapping(self):
 			self.gene_info.append((gene_num, row.seqid, row.start, row.end, gene_id, gene_name, biotype))
 
 	def get_features(self):
+		chrom = None
 		father = None
 		exons = []
-		for row in self.parse():
-			parent = row.attrs.GENE_ID
+		for r in self.parse():
+			if r.seqid != chrom:
+				if exons:
+					exons = sorted(exons, key=lambda x: x[1])
+
+					for idx, exon in enumerate(exons):
+						yield exon
+
+						if idx < len(exons)-1:
+							start = exon[2] + 1
+							end = exons[idx+1][1] - 1
+							yield (exons[0][0], start, end, 'intron', exons[0][4])
+				exons = []
+				chrom = None
+				father = None
+
+			parent = r.attrs.GENE_ID
 
-			if row.feature == 'CDS':
+			if r.feature == 'CDS':
 				yield (r.seqid, r.start, r.end, 'CDS', parent)
 
 			elif r.feature == 'FIVE_PRIME_UTR':

diff --git a/src/libs/__init__.py b/src/libs/__init__.py
@@ -4,5 +4,5 @@
 #from . import intersection
 #from . import primerdesign
 #from . import fasta
-from . import ncls32 as ncls
+from . import ncls
 from . import issr
diff --git a/src/libs/src/ncls/src/intervaldb32.h b/src/libs/src/ncls/src/intervaldb32.h
@@ -97,15 +97,15 @@ extern int find_intervals(IntervalIterator *it0, int32_t start, int32_t end,Inte
 }
 
 #define HAS_OVERLAP_POSITIVE(IM,START,END) (((IM).start>=0) ? \
-    ((IM).start<=(START) && (END)<=(IM).end) \
+    ((IM).start<(END) && (START)<(IM).end) \
   : (-((IM).end)<(END) && (START) < -((IM).start)))
  /* ????? MERGE_INTERVAL_ORIENTATIONS ??????? */
 
 #else
 /* STANDARD MACROS */
 #define START_POSITIVE(IM) ((IM).start)
 #define END_POSITIVE(IM) ((IM).end)
-#define HAS_OVERLAP_POSITIVE(IM,START,END) ((IM).start<=(START) && (END)<=(IM).end)
+#define HAS_OVERLAP_POSITIVE(IM,START,END) ((IM).start<(END) && (START)<(IM).end)
 
 #endif