-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathreform.py
executable file
·599 lines (543 loc) · 24.3 KB
/
reform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
#!/bin/env python
import argparse
import re
import os
import tempfile
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
try:
import pgzip as gzip_module
print("Using pgzip for gzip operations.")
except ImportError:
import gzip as gzip_module
print("pgzip not found, falling back to gzip.")
def main():
## Retrieve command line arguments and number of iterations
in_arg, iterations = get_input_args()
## List for previous postion and modification length
prev_modifications = []
## Path for the files generated in sequential processing, mainly managed by tempfile.
prev_fasta_path = None
prev_gff_path = None
## Sequential processing
for index in range(iterations):
## Read the new fasta (to be inserted into the ref genome)
try:
record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0]
except IndexError:
filename = in_arg.in_fasta[index].name
raise ValueError(f"Error: {filename} is not a valid FASTA file.")
except Exception as e:
raise ValueError(f"Error parsing FASTA file: {str(e)}")
## Generate index of sequences from ref reference fasta
if prev_fasta_path:
chrom_seqs = index_fasta(prev_fasta_path)
os.remove(prev_fasta_path)
else:
chrom_seqs = index_fasta(in_arg.ref_fasta)
## Obtain the sequence of the chromosome we want to modify
seq = chrom_seqs[in_arg.chrom]
seq_str = str(seq.seq)
## Get the position to insert the new sequence
positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications)
position = positions['position']
down_position = positions['down_position']
## Save current modification which include position(index) and length changed.
if position == down_position:
length_changed = len(str(record.seq))
else:
length_changed = len(str(record.seq)) - (down_position - position - 1)
prev_modifications.append((position,length_changed))
if position != down_position:
print("Removing nucleotides from position {} - {}".format(position, down_position))
print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}"
.format(record.description, in_arg.in_fasta[index], position, in_arg.chrom))
## Build the new chromosome sequence with the inserted_seq
## If the chromosome sequence length is in the header, replace it with new length
new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:]
chrom_length = str(len(seq_str))
new_length = str(len(new_seq))
new_record = SeqRecord(
Seq(new_seq),
id=seq.id,
description=seq.description.replace(chrom_length, new_length)
)
## Create new fasta file with modified chromosome
if index < iterations - 1:
new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa')
new_fasta_file.close()
new_fasta = new_fasta_file.name
prev_fasta_path = new_fasta
else:
ref_basename, _ = get_ref_basename(in_arg.ref_fasta)
ref_name = ref_basename
new_fasta = ref_name + '_reformed.fa'
with open(new_fasta, "w") as f:
for s in chrom_seqs:
if s == seq.id:
SeqIO.write([new_record], f, "fasta")
else:
SeqIO.write([chrom_seqs[s]], f, "fasta")
print("New fasta file created: ", new_fasta)
print("Preparing to create new annotation file")
## Read in new GFF features from in_gff
in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
## Create a temp file for gff, if index is not equal to last iteration
annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
print(in_arg.ref_gff)
if index < iterations - 1:
temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext)
temp_gff_name = temp_gff.name
temp_gff.close()
if prev_gff_path:
new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
os.remove(prev_gff_path)
else:
new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
else:
new_gff_name = annotation_name + '_reformed' + annotation_ext
if prev_gff_path:
new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
else:
new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
prev_gff_path = new_gff_path
print("New {} file created: {} ".format(annotation_ext.upper(), prev_gff_path))
def index_fasta(fasta_path):
'''
Process incoming FASTA files, supporting both decompressed and uncompressed
formats. It takes a file path (fasta_path), decompresses the file into a
temporary file if it is a compressed file ending in .gz, and then indexes
the temporary file. Finally, the indexing result is returned.
'''
if fasta_path.endswith('.gz'):
## Create a tempfile to store uncompressde content
with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f:
tmp_f_path = tmp_f.name
## Use pgzip or gzip to decompress parallely. Set thread=None means use all cores
with gzip_module.open(fasta_path, 'rt', thread=None) as f:
tmp_f.write(f.read())
chrom_seqs = SeqIO.index(tmp_f_path, 'fasta')
## remove temp file
os.remove(tmp_f_path)
else:
chrom_seqs = SeqIO.index(fasta_path, 'fasta')
return chrom_seqs
def get_ref_basename(filepath):
'''
Takes a filepath and returns the filename and extension minus the .gz.
'''
base = os.path.basename(filepath)
if base.endswith('.gz'):
base = base[:-3] # remove .gz
name, ext = os.path.splitext(base)
return name, ext
def modify_gff_line(elements, start=None, end=None, comment=None):
'''
Modifies an existing GFF line and returns the modified line. Currently, you can
override the start position, end position, and comment column.
gff_out: an open file handle for writing new gff lines to
elements: a list containing each column (9 in total) of a single feature line in a gff file
start: if provided, overwrite the start position in elements with this start position
end: if provided, overwrite the end position in elements with this end position
comment: if provided, overwrite the comments column in elements with this comment
'''
if start == None:
start = int(elements[3])
if end == None:
end = int(elements[4])
if comment == None:
comment = elements[8]
if not comment.endswith('\n'):
comment += '\n'
## Return the modified line
return("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], elements[6], elements[7], comment))
def get_in_gff_lines(in_gff):
'''
Takes a gff file and returns a list of lists where
each parent list item is a single line of the gff file
and the child elements are the columns of the line
'''
with open(in_gff, "r") as f:
in_gff_lines = []
for line in f:
line_elements = line.split('\t')
# Skip comment lines
if line.startswith("#"):
continue
# Ensure lines have 9 columns
if len(line_elements) != 9:
print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements))
print(line_elements)
exit()
in_gff_lines.append(line_elements)
return in_gff_lines
def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_modifications):
'''
Determine the position in seq_str to insert the new sequence given
the position, upstream, downstream, and chrom arguments.
Note that either position, or upstream AND downstream sequences must
be provided.
'''
if positions and index < len(positions) and positions[index] >= -1:
print("Checking position validity")
position = positions[index]
## Update current postion based on previous modification
for pos, lc in prev_modifications:
## Also ignore when postion == -1
if position >= pos:
position += lc
if position > len(seq_str):
print("** ERROR: Position greater than length of chromosome.")
print("Chromosome: {}\Chromosome length: {}\nPosition: \n{}".format(chrom, len(seq_str), position))
exit()
elif position == -1:
position = len(seq_str)
# At the moment we don't accept down position as a param
# so set down_position = position
down_position = position
print("Position valid")
else:
print("No valid position specified, checking for upstream and downstream sequence")
if index < len(upstream) and index < len(downstream):
seq_str = seq_str.upper()
upstream_fasta = list(SeqIO.parse(upstream[index], "fasta"))
upstream_seq = str(upstream_fasta[0].seq).upper()
downstream_fasta = list(SeqIO.parse(downstream[index], "fasta"))
downstream_seq = str(downstream_fasta[0].seq).upper()
# Ensure the upstream and downstream target sequences exists once in the selected chromosome, else die
upstream_seq_count = seq_str.count(upstream_seq)
downstream_seq_count = seq_str.count(downstream_seq)
if upstream_seq_count == 1 and downstream_seq_count == 1:
## Obtain the starting position of the left_strand
new_index = seq_str.find(upstream_seq)
position = new_index + len(upstream_seq)
down_position = seq_str.find(downstream_seq)
else:
print("** ERROR: The upstream and downstream target sequences must be present and unique in the specified chromosome.")
print("Chromosome: {}\n".format(chrom))
print("Upstream sequence found {} times".format(upstream_seq_count))
print("Downstream sequence found {} times".format(downstream_seq_count))
exit()
else:
print("** ERROR: You must specify a valid position or upstream and downstream sequences.")
exit()
if down_position < position:
print("** ERROR: Upstream and Downstream sequences must not overlap. Exiting.")
exit()
return {'position': position, 'down_position': down_position}
def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence_length, chrom):
'''
in_gff_lines: a list of lists where each nested list is a list of
columns (in gff format) associated with each new feature to insert
split_features: contains information about features in the original GFF
file that were split due to the insertion of the new sequence.
sequence_length: length of the inserted sequence, used to determine
the new end positions in the GFF file.
'''
## Replace the chromosome ID from in_gff with the correct chromosome ID
for l in in_gff_lines:
l[0] = chrom
# Handling of single-line comments
if len(in_gff_lines) == 1:
l = in_gff_lines[0]
## Check length
## l[3] is start position of fasta in in.gtf and l[4] is end position
seq_id = l[0]
if int(l[4]) - int(l[3]) + 1 != sequence_length:
print(f"** WARNING: Inconsistent length for {seq_id}. Correcting start position to 1 and end position to {sequence_length}.")
## Correct start(l[3]) to 1 and end(l[4]) to length of insert fasta
new_gff_line = modify_gff_line(
l, start=1 + position, end=sequence_length + position)
gff_out.write(new_gff_line)
# Handling of multiple-line comments
else:
### Step1: extract all start and end into corresponding set()
start_positions = set()
end_positions = set()
for l in in_gff_lines:
start_positions.add(int(l[3]))
end_positions.add(int(l[4]))
### Step2: Find min start and max end
min_start = min(start_positions)
max_end = max(end_positions)
### Step3: Check sequence length, validness of min_start and max_end
if max_end - min_start + 1 != sequence_length:
raise ValueError(f"Error: Annotation length does not match sequence length. "
f"Expected {sequence_length}, but got {max_end - min_start + 1}")
if min_start < 1:
raise ValueError(f"Error: Invalid min_start value: {min_start}. It must be a positive integer.")
if min_start > max_end:
raise ValueError(f"Error: min_start ({min_start}) cannot be greater than max_end ({max_end}).")
### Step4: Adjust start and end. Offset will be 0 if no adjust need
offset = min_start - 1
for l in in_gff_lines:
### Step5: Correct start(l[3]) and end(l[4]) by minus offset
new_gff_line = modify_gff_line(
l, start = int(l[3]) - offset + position, end = int(l[4]) - offset + position)
gff_out.write(new_gff_line)
## If insertion caused any existing features to be split, add
## the split features now immediately after adding the new features
for sf in split_features:
modified_line = modify_gff_line(
sf[0], start = sf[1], end = sf[2], comment = sf[3])
gff_out.write(modified_line)
## Return True after writing the new GFF lines
return True
def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, chrom_id, new_seq_length):
'''
Goes line by line through a gff file to remove existing features
(or parts of existing features) and/or insert new features.
In the process of adding new features, existing features
may be cut-off on either end or split into two.
This attempts to handle all cases.
new_gff_name: the name of the new gff file to create
ref_gff: the reference gff file to modify
in_gff_lines: a list of lists where each nested list is a list of
columns (in gff format) associated with each new feature to insert
position: start position of removal of existing sequence
down_position: end position of removal of existing sequence
chrom_id: the ID of the chromosome to modify
new_seq_length: the length of the new sequence being added to the chromosome
'''
with open(new_gff_name, "w") as gff_out:
in_gff_lines_appended = False
split_features = []
last_seen_chrom_id = None
gff_ext = new_gff_name.split('.')[-1]
ref_gff_path = ref_gff
if ref_gff.endswith('.gz'):
with gzip_module.open(ref_gff, 'rt') as f:
## Create a tempfile to store uncompressde content
with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f:
tmp_f.write(f.read())
ref_gff_path = tmp_f.name
with open(ref_gff_path, "r") as f:
for line in f:
line_elements = line.split('\t')
if line.startswith("#"):
if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id:
## Edit the length of the chromosome
original_length = int(line_elements[3])
new_length = original_length - (down_position - position) + new_seq_length
line = line.replace(str(original_length), str(new_length))
gff_out.write(line)
else:
gff_chrom_id = line_elements[0]
gff_feat_start = int(line_elements[3])
gff_feat_end = int(line_elements[4])
gff_feat_type = line_elements[2]
gff_feat_strand = line_elements[6]
gff_comments = line_elements[8].strip()
# If we've seen at least one chromosome
# and the last chromosome seen was the chromosome of interest (i.e. chrom_id)
# and now we're on a new chromosome in the original gff file
# and the in_gff_lines have not yet been appended:
# we assume they need to be appended to the end of the chromosome
# so append them before proceeding
if (last_seen_chrom_id is not None
and last_seen_chrom_id == chrom_id
and gff_chrom_id != last_seen_chrom_id
and not in_gff_lines_appended):
in_gff_lines_appended = write_in_gff_lines(
gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
last_seen_chrom_id = gff_chrom_id
# If this is not the chromosome of interest
# Or if the current feature ends before any
# modification (which occurs at position)
# Then simply write the feature as is (no modification)
# (remember gff co-ordinates are 1 based and position
# is 0 based, therefor check less than *or equal to*)
if gff_chrom_id != chrom_id or gff_feat_end <= position:
gff_out.write(line)
# Modify chromosome feature length (if feature is
# "chromosome" or "region")
elif gff_feat_type in ['chromosome', 'region']:
original_length = gff_feat_end
new_length = original_length - (down_position - position) + new_seq_length
line = line.replace(str(original_length), str(new_length))
gff_out.write(line)
# Split feature into 2 if feature starts before position
# and ends after down_position
elif gff_feat_start <= position and gff_feat_end > down_position:
print("Feature split")
# Which side of the feature depends on the strand (we add this as a comment)
(x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5")
new_comment = format_comment(
"original feature split by inserted sequence, this is the {} prime end".format(x),
gff_ext
)
# Modified feature ends at 'position'
modified_line = modify_gff_line(
line_elements,
end = position,
comment = gff_comments + new_comment
)
gff_out.write(modified_line)
# The downstream flank(s) will be added immediately after the
# in_gff (new) features are written.
# First, attempt to rename IDs (to indicate split)
renamed_id_attributes = rename_id(line)
new_comment = format_comment(
"original feature split by inserted sequence, this is the {} prime end".format(y),
gff_ext
)
split_features.append(
(
line_elements,
position + new_seq_length + 1,
gff_feat_end + new_seq_length - (down_position - position),
renamed_id_attributes + new_comment
)
)
# Change end position of feature to cut off point (position) if the
# feature ends within the deletion (between position & down_position)
elif gff_feat_start <= position and gff_feat_end <= down_position:
# Which side of the feature depends on the strand (we add this as a comment)
x = "3" if gff_feat_strand == "+" else "5"
print("Feature cut off - {} prime side of feature cut off ({} strand)"
.format(x, gff_feat_strand))
new_comment = format_comment(
"{} prime side of feature cut-off by inserted sequence".format(x),
gff_ext
)
modified_line = modify_gff_line(
line_elements,
end = position,
comment = gff_comments + new_comment
)
gff_out.write(modified_line)
# Skip this feature if it falls entirely within the deletion
elif gff_feat_start > position and gff_feat_end <= down_position:
print("Skip feature (this feature was removed from sequence)")
continue
else:
if not in_gff_lines_appended:
in_gff_lines_appended = write_in_gff_lines(
gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
# Change start position of feature to after cutoff point if
# the feature starts within the deletion
if (gff_feat_start > position
and gff_feat_start <= down_position
and gff_feat_end > down_position):
x = "5" if gff_feat_strand == "+" else "3"
print("Feature cut off - {} prime side of feature cut off ({} strand)"
.format(x, gff_feat_strand))
new_comment = format_comment(
"{} prime side of feature cut-off by inserted sequence".format(x),
gff_ext
)
modified_line = modify_gff_line(
line_elements,
start = position + new_seq_length + 1,
end = gff_feat_end + new_seq_length - (down_position - position),
comment = gff_comments + new_comment
)
gff_out.write(modified_line)
# Offset all downstream feature positions by offset length
elif gff_feat_start > down_position:
offset_length = new_seq_length - (down_position - position)
modified_line = modify_gff_line(
line_elements,
start = gff_feat_start + offset_length,
end = gff_feat_end + offset_length
)
gff_out.write(modified_line)
else:
print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements))
exit()
# If we've iterated over the entire original gff
# (i.e. out of the for loop which iterates over it)
# and still haven't written in_gff_lines
# and the last chromosome seen was the chromosome of interest (i.e. chrom_id):
# we assume they need to be appended to the end of the genome
# (i.e. end of the last chromosome)
# so append them now
if (last_seen_chrom_id is not None
and last_seen_chrom_id == chrom_id
and not in_gff_lines_appended):
in_gff_lines_appended = write_in_gff_lines(
gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
# Checking to ensure in_gff_lines written
if not in_gff_lines_appended:
print("** Error: Something went wrong, in_gff not added to reference gff. Exiting")
exit()
# Remove temp gtf file
if ref_gff.endswith('.gz'):
os.remove(ref_gff_path)
return new_gff_name
def format_comment(comment, ext):
'''
Format comment according to ext (GFF or GTF) and return
'''
if ext.lower() == 'gtf':
new_comment = 'reform_comment "{}";'.format(comment)
elif ext.lower().startswith('gff'):
new_comment = "; reform_comment={}".format(comment)
else:
print("** Error: Unrecognized extension {} in format_comment(). Exiting".format(ext))
exit()
return new_comment
def rename_id(line):
'''
Given a gff or gtf line, this function will append the string "_split"
to the end of the ID/gene_id attribute
'''
attributes = line.split('\t')[8].strip()
elements = attributes.split(';')
if elements[0].startswith("ID="):
print("Renaming split feature {} --> {}_split".format(elements[0], elements[0]))
return ("{}_split;{}".format(elements[0], ';'.join(elements[1:])))
elif elements[0].startswith("gene_id "):
gene_id = re.match(r'gene_id \"(.+)\"', elements[0])[1]
print('Renaming split feature {} --> {}_split'.format(gene_id, gene_id))
return ('gene _id "{}_split";{}'.format(gene_id, ';'.join(elements[1:])))
else:
print("This feature will not be renamed because it does not has an ID/gene_id attribute:\n", line)
return attributes
def get_input_args():
parser = argparse.ArgumentParser()
parser.add_argument('--chrom', type = str, required = True,
help = "Chromosome name (String)")
parser.add_argument('--in_fasta', type=str, required=True,
help="Path(s) to new sequence(s) to be inserted into reference genome in fasta format")
parser.add_argument('--in_gff', type=str, required=True,
help="Path(s) to GFF file(s) describing new fasta sequence(s) to be inserted")
parser.add_argument('--upstream_fasta', type=str, default=None,
help="Path(s) to Fasta file(s) with upstream sequence. Either position, or upstream AND downstream sequence must be provided.")
parser.add_argument('--downstream_fasta', type=str, default=None,
help="Path(s) to Fasta file(s) with downstream sequence. Either position, or upstream AND downstream sequence must be provided.")
parser.add_argument('--position', type=str, default=None,
help="Comma-separated positions at which to insert new sequence. Note: Position is 0-based, no space between each comma. Either position, or upstream AND downstream sequence must be provided.")
parser.add_argument('--ref_fasta', type = str, required = True,
help = "Path to reference fasta file")
parser.add_argument('--ref_gff', type = str, required = True,
help = "Path to reference gff file")
in_args = parser.parse_args()
in_args.in_fasta = in_args.in_fasta.split(',')
in_args.in_gff = in_args.in_gff.split(',')
if in_args.upstream_fasta:
in_args.upstream_fasta = in_args.upstream_fasta.split(',')
if in_args.downstream_fasta:
in_args.downstream_fasta = in_args.downstream_fasta.split(',')
if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None):
print("** Error: You must provide either the position, or the upstream and downstream sequences.")
exit()
if not in_args.position and len(in_args.upstream_fasta) != len(in_args.downstream_fasta):
print("** Error: The number of upstream_fasta and downstream_fasta files does not match.")
exit()
if in_args.position is not None:
try:
in_args.position = list(map(int, in_args.position.split(',')))
iterations = iterations = len(in_args.position)
except ValueError:
print("** Error: Position must be a comma-separated list of integers, like 1,5,-1.")
exit()
else:
iterations = len(in_args.upstream_fasta)
if (len(in_args.in_fasta) != len(in_args.in_gff)) or (len(in_args.in_fasta) != iterations):
print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.")
exit()
return in_args, iterations
if __name__ == "__main__":
main()