-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathgffcompare.cpp
2569 lines (2414 loc) · 94.5 KB
/
gffcompare.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "GArgs.h"
#include <ctype.h>
#include <errno.h>
#include "gtf_tracking.h"
#define VERSION "0.12.9"
#define USAGE "Usage:\n\
gffcompare [-r <reference_mrna.gtf> [-R]] [-T] [-V] [-s <seq_path>]\n\
[-o <outprefix>] [-p <cprefix>] \n\
{-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}\n\
\n\
GffCompare provides classification and reference annotation mapping and\n\
matching statistics for RNA-Seq assemblies (transfrags) or other generic\n\
GFF/GTF files.\n\
GffCompare also clusters and tracks transcripts across multiple GFF/GTF\n\
files (samples), writing matching transcripts (identical intron chains) into\n\
<outprefix>.tracking, and a GTF file <outprefix>.combined.gtf which \n\
contains a nonredundant set of transcripts across all input files (with\n\
a single representative transfrag chosen for each clique of matching transfrags\n\
across samples).\n\
\n\
Options:\n\
-v display gffcompare version (also --version)\n\
-i provide a text file with a list of (query) GTF files to process instead\n\
of expecting them as command line arguments (useful when a large number\n\
of GTF files should be processed)\n\
\n\
-r reference annotation file (GTF/GFF)\n\
-R for -r option, consider only the reference transcripts that\n\
overlap any of the input transfrags (Sn correction)\n\
-Q for -r option, consider only the input transcripts that\n\
overlap any of the reference transcripts (Precision correction);\n\
(Warning: this will discard all \"novel\" loci!)\n\
-M discard (ignore) single-exon transfrags and reference transcripts\n\
-N discard (ignore) single-exon reference transcripts\n\
-D discard \"duplicate\" query transfrags (i.e. same intron chain) within\n\
a single sample (disable \"annotation\" mode for a single file); \n\
this option is automatically enabled when multiple query files are provided\n\
-S when -D is enabled (or multiple query files are provided), perform a more \n\
strict duplicate checking: only discard matching (same intron chain) query \n\
transcripts from the same sample if their boundaries are fully contained \n\
within (or same with) matching transcripts\n\
if --strict-match is also given, exact match of all exons is required\n\
--no-merge : disable close-exon merging (default: merge exons separated by\n\
\"introns\" shorter than 5 bases\n\
\n\
-s path to genome sequences (optional); this can be either a multi-FASTA\n\
file or a directory containing single-fasta files (one for each contig);\n\
repeats must be soft-masked (lower case) in order to be able to classify\n\
transfrags as repeats\n\
\n\
-e when estimating exon level accuracy, this is the maximum range\n\
variation allowed for the free ends of terminal exons (default 100);\n\
this terminal exon restriction applies to transcript level accuracy\n\
when --strict-match option is given\n\
--strict-match : transcript matching takes into account the -e range\n\
for terminal exons; code '=' is only assigned if transcript ends are\n\
within that range, otherwise code '~' is assigned just for intron chain\n\
match (or significant overlap in the case of single exon transcripts)\n\
\n\
-d max. distance (range) for grouping transcript start sites (100)\n\
-V verbose processing mode (also shows GFF parser warnings)\n\
-T do not generate .tmap and .refmap files for each input file\n\
--chr-stats: the .stats file will show summary and accuracy data\n\
per reference contig/chromosome\n\
-j <output.tab> if -r was given, writes novel junctions in this file\n\
--debug : enables -V and generates additional files: \n\
<outprefix>.Q_discarded.lst, <outprefix>.missed_introns.gff,\n\
<outprefix>.R_missed.lst\n\
\n\
Options for the combined GTF output file:\n\
-p the name prefix to use for consensus transcripts in the \n\
<outprefix>.combined.gtf file (default: 'TCONS')\n\
-C discard matching and \"contained\" transfrags in the GTF output\n\
(i.e. collapse intron-redundant transfrags across all query files)\n\
-A like -C but does not discard intron-redundant transfrags if they start\n\
with a different 5' exon (possible alternate TSS; for the same first 5'\n\
intron, minimum TSS distance is specified by -d option, default 100)\n\
-X like -C but also discard contained transfrags even when transfrag ends \n\
stick out within the container's introns (by at most 50 bases)\n\
--cset for -C/-A/-X also discard single exon transfrags when fully contained\n\
in an exon of a multi-exon transfrag \n\
-K for -C/-A/-X, do NOT discard any redundant transfrag matching a reference\n\
"
/*
--gids : append related reference gene_id values to gene_id\n\
--gidnames : append related reference gene_name values to gene_id\n\
Note: --gids and --gidnames options are mutually exclusive!\n\
--gnames : make gene_name include reference gene_name values, if any\n\
*/
bool perContigStats=false; // -S to enable separate stats for every contig/chromosome
//bool generic_GFF=false;
//true if -G: won't discard intron-redundant transfrags
bool discardContained=false; //activated by -C, -A or -X
//bool showContained=true; // opposite of -C, default is to show them in combined.gtf
//-- moved to gtf_tracking
bool keepAltTSS=false; //-A option
bool keepRefMatching=false; //-K with -C/-A/-X
bool allowIntronSticking=false; //-X option
bool reduceQrys=false; //-Q
bool checkFasta=false;
bool tmapFiles=true;
//ref gene_id or gene_name values are separated by '|' (pipe character) when
//appended to the original gene_id
//gene_name values are separated by ',' (comma) in the gene_name attribute
//TODO implement these
bool set_gene_name=false; //gene_name set to the list of overlapping ref gene_names
bool gid_add_ref_gids=false; //append overlapping ref gene_ids to gene_id
bool gid_add_ref_gnames=false; //append overlapping ref gene_names to gene_id
bool qDupDiscard=false;
//strictMatching=false; // really match *all* exon coords for '=' class code!
// use '~' class code for intron-chain matches
// (or fuzzy single-exon transcripts overlaps)
//noMergeCloseExons=false; //prevent joining close exons?
bool only_spliced_refs=false;
int debugCounter=0;
bool gffAnnotate=false;
GStr consGTF;
int outConsCount=0;
int polyrun_range=2000; //polymerase run range 2KB
double scoreThreshold=0;
char* cprefix=NULL;
bool tprefix_req=false;
FILE* ffasta=NULL; //genomic seq file
FILE *f_ref=NULL; //reference mRNA GFF, if provided
FILE* f_in=NULL; //sequentially, each input GFF file
FILE* f_out=NULL; //stdout if not provided
GFastaHandler gfasta;
int xlocnum=0;
int tsscl_num=0; //for tss cluster IDs
int protcl_num=0; //for "unique" protein IDs within TSS clusters
uint exonEndRange=100; // -e value, only used for exon level Sn/Sp
//int total_tcons=0;
int total_xloci_alt=0;
void openfwrite(FILE* &f, GArgs& args, char opt) {
GStr s=args.getOpt(opt);
if (!s.is_empty()) {
if (s=='-')
f=stdout;
else {
f=fopen(s,"w");
if (f==NULL) GError("Error creating file: %s\n", s.chars());
}
}
}
//-- structure to keep track of data from multiple qry input files for a single genomic seq
class GSeqTrack {
int gseq_id;
public:
const char* gseq_name;
GList<GLocus>* rloci_f; //reference loci for this genomic sequence
GList<GLocus>* rloci_r;
GList<GXLocus> xloci_f; // extended super-loci across all qry datasets
GList<GXLocus> xloci_r; // extended super-loci across all qry datasets
GList<GXLocus> xloci_u; // extended super-loci across all qry datasets
GVec<GSeqData*> qdata; //fixed order array with GSeqData for each qry input
//element in array is NULL if a qry file has no transcripts on this genomic sequence
int get_gseqid() { return gseq_id; }
GSeqTrack(int numqryfiles, int gid):xloci_f(true,true,false),
xloci_r(true,true,false), xloci_u(true,true,false), qdata() {
gseq_id=gid;
if (gseq_id>=0) {
gseq_name=GffObj::names->gseqs.getName(gseq_id);
}
rloci_f=NULL;
rloci_r=NULL;
//for (int i=0;i<MAX_QFILES;i++) qdata[i]=NULL;
qdata.Resize(numqryfiles, NULL);
}
bool operator==(GSeqTrack& d){
return (gseq_id==d.gseq_id);
}
bool operator<(GSeqTrack& d){
return (gseq_id<d.gseq_id);
}
};
//char* getFastaFile(int gseq_id);
// ref globals
bool haveRefs=false; //true if a reference annotation (-r) is provide
GList<GSeqData> ref_data(true,true,true); //list of reference mRNAs and loci data for each genomic seq
//each locus will keep track of any superloci which includes it, formed during the analysis
void processLoci(GSeqData& seqdata, GSeqData* refdata=NULL, int qfidx=0);
void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
GSeqData* seqdata=NULL, GSeqData* refdata=NULL, int qfidx=-1);
GSeqData* getQryData(int gid, GList<GSeqData>& qdata);
void trackGData(int qcount, GList<GSeqTrack>& gtracks, GStr& fbasename, FILE** ftr, FILE** frs);
#define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
#define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
FILE* f_mintr=NULL; //missed ref introns (debug only)
FILE* f_qdisc=NULL; //discarded query transfrags (debug only)
FILE* f_rmiss=NULL; //missed ref transcripts (debug only)
FILE* f_nj=NULL; //-j novel junctions output file
bool multiexon_only=false;
bool multiexonrefs_only=false;
GHash<GStr*> refdescr;
void loadRefDescr(const char* fname);
GList<GStr> qryfiles(false,true,false);
//list of GSeqTrack data, sorted by gseq_id
GList<GSeqTrack> gseqtracks(true,true,true);
GSeqTrack* findGSeqTrack(int gsid);
int cmpGTrackByName(const pointer p1, const pointer p2) {
return strcmp(((GSeqTrack*)p1)->gseq_name, ((GSeqTrack*)p2)->gseq_name);
}
void show_version() {
GMessage("gffcompare v%s\n", VERSION);
}
void show_usage() {
GMessage("gffcompare v%s\n", VERSION);
GMessage( "-----------------------------\n");
GMessage("%s\n", USAGE);
}
void RefReqCheck(bool v, const char* opt) {
if (v)
GError("%s\nError: option %s requires reference annotation (-r)\n",
USAGE, opt);
}
//preallocate the cmpovl slots for ref loci,
// to keep the info about the query loci overlapping each of them by qfidx
void prepRefLoci(GList<GLocus> & rloci, int qcount) {
for (int i=0;i<rloci.Count();i++)
for (int q=0;q<qcount;q++)
rloci[i]->qlocovls.Add(new GList<GLocus>(true, false, true));
}
int main(int argc, char* argv[]) {
#ifdef HEAPROFILE
if (!IsHeapProfilerRunning())
HeapProfilerStart("./gffcompare_dbg.hprof");
#endif
GArgs args(argc, argv,
"version;help;debug;gids;cset;gidnames;gnames;no-merge;strict-match;cds-match;"
"chr-stats;vACDSGEFJKLMNQTVRXhp:e:d:s:i:j:n:r:o:");
int e;
if ((e=args.isError())>0) {
show_usage();
GMessage("Invalid argument: %s\n", argv[e]);
exit(1);
}
if (args.getOpt('h') || args.getOpt("help")){
show_usage();
exit(0);
}
if (args.getOpt('v') || args.getOpt("version")){
show_version();
exit(0);
}
debug=(args.getOpt("debug")!=NULL);
tmapFiles=(args.getOpt('T')==NULL);
multiexon_only=(args.getOpt('M')!=NULL);
multiexonrefs_only=(args.getOpt('N')!=NULL);
set_gene_name=(args.getOpt("gnames")!=NULL);
gid_add_ref_gids=(args.getOpt("gids")!=NULL);
gid_add_ref_gnames=(args.getOpt("gidnames")!=NULL);
qDupDiscard=(args.getOpt('D')!=NULL);
qDupStrict=(args.getOpt('S')!=NULL);
stricterMatching=(args.getOpt("strict-match")!=NULL);
//if (stricterMatching) exonEndRange=0;
cdsMatching = (args.getOpt("cds-match") != NULL);
noMergeCloseExons=(args.getOpt("no-merge")!=NULL);
if (gid_add_ref_gids && gid_add_ref_gnames)
GError("Error: options --gids and --gidnames are mutually exclusive!\n");
perContigStats=(args.getOpt("chr-stats")!=NULL);
checkFasta=(args.getOpt('J')!=NULL);
gtf_tracking_verbose=((args.getOpt('V')!=NULL) || debug);
FILE* finlst=NULL;
GStr s=args.getOpt('i');
if (!s.is_empty()) {
if (s=='-')
finlst=stdin;
else {
finlst=fopen(s,"r");
if (finlst==NULL) GError("Error opening file: %s\n", s.chars());
}
}
if (finlst) {
GLineReader* lr=new GLineReader(finlst);
char* l=NULL;
while ((l=lr->getLine())!=NULL) {
if (strlen(l)<2 || startsWith(l,"# ") || isspace(*l)) continue;
if (!fileExists(l)) GError("Error: cannot locate input file: %s\n", l);
qryfiles.Add(new GStr(l));
}
delete lr;
//if (qryfiles.Count()>10)
gtf_tracking_largeScale=true;
tmapFiles=false;
}
else {
numQryFiles=args.startNonOpt();
char *infile=NULL;
if (numQryFiles>0) {
if (numQryFiles>6) {
gtf_tracking_largeScale=true;
tmapFiles=false;
}
while ((infile=args.nextNonOpt())!=NULL) {
if (!fileExists(infile)) GError("Error: cannot locate input file: %s\n", infile);
qryfiles.Add(new GStr(infile));
} //for each argument
}
}
numQryFiles=qryfiles.Count();
if (numQryFiles==0) {
show_usage();
exit(1);
}
gfasta.init(args.getOpt('s'));
// determine if -s points to a multi-fasta file or a directory
//s=args.getOpt('c');
//if (!s.is_empty()) scoreThreshold=s.asReal();
s=args.getOpt('p');
if (!s.is_empty()) {
cprefix=Gstrdup(s.chars());
tprefix_req=true;
} else cprefix=Gstrdup("TCONS");
s=args.getOpt('e');
if (!s.is_empty())
exonEndRange=s.asInt();
if (stricterMatching)
terminalMatchRange=exonEndRange;
s=args.getOpt('d');
if (!s.is_empty()) tssDist=s.asInt();
s=args.getOpt('n');
if (!s.is_empty()) loadRefDescr(s.chars());
reduceRefs=(args.getOpt('R')!=NULL);
reduceQrys=(args.getOpt('Q')!=NULL);
//if a full pathname is given
//the other common output files will still be created in the current directory:
// .loci, .tracking, .stats
GStr outbasename; //include path, if provided
GStr outprefix; //without path and/or extension
GStr outstats=args.getOpt('o');
if (outstats.is_empty() || outstats=="-") {
outstats="gffcmp";
}
outbasename=outstats;
GStr outext(getFileExt(outstats.chars()));
if (outext.is_empty()) {
outext="stats";
outstats.append(".stats");
outbasename=outstats;
}
else outext.lower();
if (outext=="txt" || outext=="out" || outext=="stats" || outext=="summary") {
outbasename.cut(outbasename.length()-outext.length()-1);
}
outprefix=outbasename;
int di=outprefix.rindex(CHPATHSEP);
if (di>=0) outprefix.cut(0,di+1);
if (gtf_tracking_verbose) GMessage("Prefix for output files: %s\n", outprefix.chars());
s=args.getOpt('r');
if (s.is_empty()) {
RefReqCheck(multiexonrefs_only, "-M");
RefReqCheck(reduceRefs, "-R");
RefReqCheck(reduceQrys, "-Q");
RefReqCheck(set_gene_name, "--gnames");
RefReqCheck(gid_add_ref_gids, "--gids");
RefReqCheck(gid_add_ref_gnames, "--gidnames");
} else { //reference annotation file provided
f_ref=fopen(s,"r");
if (f_ref==NULL) GError("Error opening reference gff: %s\n",s.chars());
haveRefs=true;
if (gtf_tracking_verbose) GMessage("Loading reference transcripts..\n");
read_mRNAs(f_ref, ref_data, &ref_data, true, -1, s.chars(),
(multiexonrefs_only || multiexon_only));
haveRefs=(ref_data.Count()>0);
//if (gtf_tracking_verbose) GMessage("..reference annotation loaded\n");
if (haveRefs && !gtf_tracking_largeScale) {
//prepare qlocovls for each ref locus
for (int i=0;i<ref_data.Count();i++) {
prepRefLoci(ref_data[i]->loci_f, qryfiles.Count());
prepRefLoci(ref_data[i]->loci_r, qryfiles.Count());
}
}
}
if (args.getOpt('C')) discardContained=true;
if (args.getOpt('A')) {
//redundancy check will NOT consider containment for alt. TSS
keepAltTSS=true;
if (discardContained)
GMessage("Warning: option -C ignored, -A takes precedence.\n");
discardContained=true;
}
if (args.getOpt('X')) {
discardContained=true;
allowIntronSticking=true;
}
if (args.getOpt("cset")) {
if (!discardContained) {
GMessage("Warning: --cset option ignored, requires -C, -A or -X\n");
} else {
cSETMerge=true;
}
}
keepRefMatching=(args.getOpt('K')!=NULL);
if (keepRefMatching && !discardContained) {
GMessage("Warning: -K option ignored, requires -C, -A or -X\n");
keepRefMatching=false;
}
s=args.getOpt('j');
if (!s.is_empty()) {
f_nj=fopen(s.chars(),"w");
if (f_nj==NULL) GError("Error creating file %s!\n",s.chars());
}
if (debug) { //create a few more files potentially useful for debugging
s=outbasename;
s.append(".missed_introns.gff");
f_mintr=fopen(s.chars(),"w");
if (f_mintr==NULL) GError("Error creating file %s!\n",s.chars());
s=outbasename;
s.append(".R_missed.gff");
f_rmiss=fopen(s.chars(),"w");
if (f_rmiss==NULL) GError("Error creating file %s!\n",s.chars());
if (reduceQrys) { //only if -Q option was used
s=outbasename;
s.append(".Q_discarded.lst");
f_qdisc=fopen(s.chars(),"w");
if (f_qdisc==NULL) GError("Error creating file %s!\n",s.chars());
}
}
f_out=fopen(outstats, "w");
if (f_out==NULL) GError("Error creating output file %s!\n", outstats.chars());
fprintf(f_out, "# gffcompare v%s | Command line was:\n#", VERSION);
for (int i=0;i<argc-1;i++)
fprintf(f_out, "%s ", argv[i]);
fprintf(f_out, "%s\n#\n", argv[argc-1]);
//int qfileno=0;
GList<GSeqData>** qrysdata=NULL;
FILE** tfiles=NULL;
FILE** rtfiles=NULL;
GMALLOC(qrysdata, numQryFiles*sizeof(GList<GSeqData>*));
if (tmapFiles) {
GMALLOC(tfiles, numQryFiles*sizeof(FILE*));
if (haveRefs) {
GMALLOC(rtfiles, numQryFiles*sizeof(FILE*));
}
}
gffAnnotate=(numQryFiles==1 && !discardContained && haveRefs && !qDupDiscard
&& !qDupStrict && !tprefix_req);
consGTF=outbasename;
if (gffAnnotate) consGTF.append(".annotated.gtf");
else consGTF.append(".combined.gtf");
for (int fi=0;fi<qryfiles.Count();fi++) {
GStr in_file(qryfiles[fi]->chars());
GStr infname(getFileName(qryfiles[fi]->chars())); //file name only
GStr indir(qryfiles[fi]->chars());
di=indir.rindex(CHPATHSEP);
if (di>=0) indir.cut(di+1); //directory path for this input file
else indir=""; //current directory
//if (debug || (gtf_tracking_verbose && !gtf_tracking_largeScale))
if (qryfiles.Count()>1)
GMessage("Loading query file #%d: %s\n",fi+1, in_file.chars());
if (in_file=="-") { f_in=stdin; in_file="stdin"; }
else {
f_in=fopen(in_file.chars(),"r");
if (f_in==NULL)
GError("Cannot open input file %s!\n",in_file.chars());
}
//f_in is the query gff file to process
GStr sbase(indir);
sbase.append(outprefix);
sbase.append(".");
sbase.append(infname);
if (tmapFiles) {
//-- we should keep the infname path, otherwise the remaining file names
// may be the same and clobber each other
s=sbase;
s.append(".tmap");
tfiles[fi]=fopen(s.chars(),"w");
if (tfiles[fi]==NULL)
GError("Error creating file '%s'!\n",s.chars());
fprintf(tfiles[fi],"ref_gene_id\tref_id\tclass_code\tqry_gene_id\tqry_id\tnum_exons\tFPKM\tTPM\tcov\tlen\tmajor_iso_id\tref_match_len\n");
if (haveRefs) {
s=sbase;
s.append(".refmap");
rtfiles[fi]=fopen(s.chars(),"w");
if (rtfiles[fi]==NULL)
GError("Error creating file '%s'!\n",s.chars());
fprintf(rtfiles[fi],"ref_gene\tref_id\tclass_code\tqry_id_list\n");
}
}
GList<GSeqData>* pdata=new GList<GSeqData>(true,true,true);
qrysdata[fi]=pdata;
//int discard_check=discard_redundant;
//if (keepRefMatching) {
// discard_check=0;
//}
read_mRNAs(f_in, *pdata, &ref_data, !gffAnnotate, fi, in_file.chars(), multiexon_only);
GSuperLocus gstats;
for (int g=0;g<pdata->Count();g++) { //for each genomic sequence in this qry dataset
int gsid=pdata->Get(g)->get_gseqid();
GSeqData* refdata=getRefData(gsid, ref_data);//ref data for this contig
if (!gtf_tracking_largeScale)
processLoci(*(pdata->Get(g)), refdata, fi);
GSeqTrack* seqtrack=findGSeqTrack(gsid); //this will add a gseqtrack if it doesn't exist
// for gsid
if (refdata!=NULL) {
seqtrack->rloci_f=&(refdata->loci_f);
seqtrack->rloci_r=&(refdata->loci_r);
}
seqtrack->qdata[fi]=pdata->Get(g);
//will only gather data into stats if perContig==false
if (!gtf_tracking_largeScale) reportStats(f_out, getGSeqName(gsid), gstats,
pdata->Get(g), refdata, fi);
} //for each genomic sequence data for the current query file
//-- there could also be genomic sequences with no qry transcripts
// but only reference transcripts so they weren't found above
if (haveRefs && !reduceRefs && !gtf_tracking_largeScale) {
for (int r=0;r<ref_data.Count();r++) {
GSeqData* refdata=ref_data[r];
int gsid=refdata->get_gseqid();
if (getQryData(gsid, *pdata)==NULL) {
reportStats(f_out, getGSeqName(gsid), gstats, NULL, refdata);
}//completely missed all refdata on this contig
}
}
//now report the summary:
if (!gtf_tracking_largeScale) reportStats(f_out, in_file.chars(), gstats);
//qfileno++;
}//for each query file
if (f_mintr!=NULL) fclose(f_mintr); //to write missed introns
if (f_qdisc!=NULL) fclose(f_qdisc); //to write discarded query transcripts
if (f_rmiss!=NULL) fclose(f_rmiss); //to write missed references
if (f_nj!=NULL) fclose(f_nj); //to write missed introns
gseqtracks.setSorted(&cmpGTrackByName);
if (gtf_tracking_verbose && numQryFiles>1)
GMessage("Tracking transcripts across %d query file(s)..\n", numQryFiles);
trackGData(numQryFiles, gseqtracks, outbasename, tfiles, rtfiles);
fprintf(f_out, "\n Total union super-loci across all input datasets: %d \n", xlocnum);
if (numQryFiles>1) {
fprintf(f_out, " (%d multi-transcript, ~%.1f transcripts per locus)\n",
total_xloci_alt, ((double)(GXConsensus::count))/xlocnum);
}
int redundant_consensi=GXConsensus::count-outConsCount;
fprintf(f_out, "%d out of %d consensus transcripts written in %s (%d discarded as redundant)\n",
outConsCount, GXConsensus::count, consGTF.chars(), redundant_consensi);
if (gtf_tracking_verbose) GMessage("Cleaning up..\n");
GFREE(cprefix);
// clean up
for (int i=0;i<numQryFiles;i++) {
delete qrysdata[i];
}
GFREE(qrysdata);
GFREE(tfiles);
GFREE(rtfiles);
gseqtracks.Clear();
FWCLOSE(f_out);
if (gtf_tracking_verbose) GMessage("Done.\n");
ref_data.Clear();
//getchar();
} //main ends here
void show_exons(FILE* f, GffObj& m) {
fprintf(f,"(");
int imax=m.exons.Count()-1;
for (int i=0;i<=imax;i++) {
if (i==imax) fprintf(f,"%d-%d)",m.exons[i]->start, m.exons[i]->end);
else fprintf(f,"%d-%d,",m.exons[i]->start, m.exons[i]->end);
}
}
bool exon_match(GXSeg& r, GXSeg& q, uint fuzz=0) {
uint sd = (r.start>q.start) ? r.start-q.start : q.start-r.start;
uint ed = (r.end>q.end) ? r.end-q.end : q.end-r.end;
uint ex_range=exonEndRange;
if (ex_range<=fuzz) ex_range=fuzz;
if ((r.flags&1) && (q.flags&1)) { // first exon ?
if (sd>ex_range) return false;
}
else {
if (sd>fuzz) return false;
}
if ((r.flags&2) && (q.flags&2)) { // last exon ?
if (ed>ex_range) return false;
}
else {
if (ed>fuzz) return false;
}
return true;
}
void addQLocOvl(GLocus* rloc, GLocus* qloc, int qfidx) {
//adds a qloc to a list of qloc overlaps (cmpovl list) based on qfidx
//only applies to a ref locus
if (qfidx>=rloc->qlocovls.Count())
GError("Error: addQLocOvl() not ready!\n");
rloc->qlocovls[qfidx]->Add(qloc);
}
void compareLoci2R(GList<GLocus>& loci, GList<GSuperLocus>& cmpdata,
GList<GLocus>& refloci, int qfidx) {
cmpdata.Clear();//a new list of superloci will be built
if (refloci.Count()==0 || loci.Count()==0) return;
//reset cmpovl and stats
for (int i=0;i<refloci.Count();i++)
refloci[i]->creset();
//find loci with overlapping refloci
//and store cmpovl links both ways for ALL loci and refloci on this strand
for (int l=0;l<loci.Count();l++) {
GLocus* locus=loci[l];
locus->creset();
for (int j=0;j<refloci.Count();j++) {
if (refloci[j]->start>locus->end) {
if (refloci[j]->start > locus->end + GFF_MAX_LOCUS) break;
continue;
}
if (locus->start>refloci[j]->end) continue;
//must check for proper exon overlap:
if (locus->exonOverlap(*refloci[j])) {
//cmpovl is a list of exon-overlapping loci
locus->cmpovl.Add(refloci[j]); //qry locus adds this overlapping ref locus
refloci[j]->cmpovl.Add(locus); //ref locus adds this overlapping qry locus
addQLocOvl(refloci[j], locus, qfidx);
}
}//for each reflocus
} //for each locus
//create corresponding "superloci" from transitive overlapping between loci and ref
for (int l=0;l<loci.Count();l++) {
if (loci[l]->v!=0) continue; //skip, already processed
GSuperLocus* super=new GSuperLocus();
super->qfidx=qfidx;
//try to find all other loci connected to this locus loci[l]
GPVec<GLocus> lstack(false); //traversal stack
lstack.Push(loci[l]);
while (lstack.Count()>0) {
GLocus* locus=lstack.Pop();
if (locus->v!=0 || locus->cmpovl.Count()==0) continue;
super->addQlocus(*locus);
locus->v=1;
for (int r=0;r<locus->cmpovl.Count();r++) {
GLocus* rloc=locus->cmpovl[r];
if (rloc->v==0) {
super->addRlocus(*rloc);
rloc->v=1;
for (int ll=0;ll<rloc->cmpovl.Count();ll++) {
if (rloc->cmpovl[ll]->v==0)
lstack.Push(rloc->cmpovl[ll]);
}
}
} //for each overlapping ref locus
} //while linking
if (super->qloci.Count()==0) {
delete super;
continue; //try next query locus
}
//--here we have a "superlocus" region data on both qry and ref
// -- analyze mexons matching (base level metrics)
cmpdata.Add(super);
//make each ref locus keep track of all superloci containing it
for (int x=0;x<super->rmexons.Count();x++) {
super->rbases_all += super->rmexons[x].end-super->rmexons[x].start+1;
}
for (int x=0;x<super->qmexons.Count();x++) {
super->qbases_all += super->qmexons[x].end-super->qmexons[x].start+1;
}
int i=0; //locus mexons
int j=0; //refmexons
while (i<super->qmexons.Count() && j<super->rmexons.Count()) {
uint istart=super->qmexons[i].start;
uint iend=super->qmexons[i].end;
uint jstart=super->rmexons[j].start;
uint jend=super->rmexons[j].end;
if (iend<jstart) { i++; continue; }
if (jend<istart) { j++; continue; }
//v--overlap here:
uint ovlstart = jstart>istart? jstart : istart;
uint ovlend = iend<jend ? iend : jend;
uint ovlen=ovlend-ovlstart+1;
super->baseTP+=ovlen; //qbases_cov
if (iend<jend) i++;
else j++;
} //while mexons ovl search
/* if (reduceRefs) {
super->baseFP=super->qbases_all-super->baseTP;
super->baseFN=super->rbases_all-super->baseTP;
}
*/
// -- exon level comparison:
int* qexovl; //flags for qry exons with ref overlap
GCALLOC(qexovl,super->quexons.Count()*sizeof(int));
int* rexovl; //flags for ref exons with qry overlap
GCALLOC(rexovl,super->ruexons.Count()*sizeof(int));
for (int i=0;i<super->quexons.Count();i++) {
uint istart=super->quexons[i].start;
uint iend=super->quexons[i].end;
for (int j=0;j<super->ruexons.Count();j++) {
uint jstart=super->ruexons[j].start;
uint jend=super->ruexons[j].end;
if (iend<jstart) break;
if (jend<istart) continue;
//--- overlap here between quexons[i] and ruexons[j]
qexovl[i]++;
rexovl[j]++;
/*if (exon_match(super->quexons[i], super->ruexons[j],5)) {
if (!ATPfound) { //count a ref approx match only once
super->exonATP++;
ATPfound=true;
}*/
if (exon_match(super->quexons[i], super->ruexons[j])) {
if ((super->ruexons[j].flags & 4)==0) {
super->exonTP++;
super->ruexons[j].flags |= 4;
}
if ((super->quexons[i].flags & 4)==0) {
super->exonQTP++;
super->quexons[i].flags |= 4;
}
} //exact match
//} //fuzzy match
} //ref uexon loop
} //qry uexon loop
//DEBUG only:
//if (super->exonTP > super->total_rexons)
// GMessage("Warning: superlocus %d-%d has exonTP %d > total rexons %d\n",
// super->start, super->end, super->exonTP, super->total_rexons);
super->m_exons=0; //ref exons with no query overlap
super->w_exons=0; //qry exons with no ref overlap
for (int x=0;x<super->quexons.Count();x++)
if (qexovl[x]==0) super->w_exons++;
for (int x=0;x<super->ruexons.Count();x++)
if (rexovl[x]==0) super->m_exons++;
GFREE(rexovl);
GFREE(qexovl);
//-- intron level stats:
//query:
int* qinovl=NULL; //flags for qry introns with at some ref intron overlap
int* qtpinovl=NULL; //flags for qry introns with ref intron match
if (super->qintrons.Count()>0) {
GCALLOC(qinovl,super->qintrons.Count()*sizeof(int));
GCALLOC(qtpinovl,super->qintrons.Count()*sizeof(int));
}
//-- reference:
int* rinovl=NULL; //flags for ref introns with qry overlap
int* rtpinovl=NULL; //ref introns with qry intron match
if (super->rintrons.Count()>0) {
GCALLOC(rinovl,super->rintrons.Count()*sizeof(int));
GCALLOC(rtpinovl,super->rintrons.Count()*sizeof(int));
}
for (int i=0;i<super->qintrons.Count();i++) {
uint istart=super->qintrons[i].start;
uint iend=super->qintrons[i].end;
for (int j=0;j<super->rintrons.Count();j++) {
uint jstart=super->rintrons[j].start;
uint jend=super->rintrons[j].end;
if (iend<jstart) break;
if (jend<istart) continue;
//--- overlap here between qintrons[i] and rintrons[j]
qinovl[i]++;
rinovl[j]++;
if (super->qintrons[i].coordMatch(&super->rintrons[j])) {
super->intronTP++;
qtpinovl[i]++;
rtpinovl[j]++;
} //exact match
} //ref intron loop
} //qry intron loop
super->m_introns=0; //ref introns with no query overlap (missed introns)
super->w_introns=0; //qry introns with no ref overlap (wrong introns)
for (int x=0;x<super->qintrons.Count();x++) {
if (qinovl[x]==0) {
super->w_introns++;
//qry introns with no ref intron overlap at all
super->i_qwrong.Add(super->qintrons[x]);
} else if (qtpinovl[x]==0) { //novel introns = qry introns with no ref intron match
super->i_qnotp.Add(super->qintrons[x]);
}
}
for (int x=0;x<super->rintrons.Count();x++) {
if (rinovl[x]==0) { //no qry intron overlap at all
super->m_introns++; //ref introns totally missed = not having any query intron overlaps
super->i_missed.Add(super->rintrons[x]);
} else if (rtpinovl[x]==0) { //no qry intron match
//ref introns with with no qry intron match
super->i_notp.Add(super->rintrons[x]);
}
}
GFREE(rinovl);
GFREE(rtpinovl);
GFREE(qinovl);
GFREE(qtpinovl);
// ---- now intron-chain and transcript matching
GVec<char> matched_refs(super->rmrnas.Count(), '\0'); //keep track of matched refs
//'=' when "exact" (within terminalMatchRange), or if no strict matching was requested
//'~' when stricter transcript matching is activated and only the intron chain was matched
//GVec<int> amatched_refs(super->rmrnas.Count(), 0); //keep track of fuzzy-matched refs
for (int i=0;i<super->qmrnas.Count();i++) {
uint istart=super->qmrnas[i]->exons.First()->start;
uint iend=super->qmrnas[i]->exons.Last()->end;
for (int j=0;j<super->rmrnas.Count();j++) {
if (matched_refs[j]=='=') continue; //already counted as ichainTP and mrnaTP
uint jstart=super->rmrnas[j]->exons.First()->start;
uint jend=super->rmrnas[j]->exons.Last()->end;
if (iend<jstart) break;
if (jend<istart) continue;
//--- overlapping transcripts ---
if (super->qmrnas[i]->udata & 2) continue; //already found a matching ref for this
GLocus* qlocus=((CTData*)super->qmrnas[i]->uptr)->locus;
GLocus* rlocus=((CTData*)super->rmrnas[j]->uptr)->locus;
int ovlen=0;
//look for a transcript match ('=' code for full exact exons match, '~' )
char tmatch=transcriptMatch(*(super->qmrnas[i]),*(super->rmrnas[j]), ovlen, terminalMatchRange);
//bool isTMatch=(tmatch>0);
if (tmatch) {
if (!stricterMatching) tmatch='=';
//at least the intron chains match !
if (super->qmrnas[i]->exons.Count()>1) {
super->ichainTP++;
qlocus->ichainTP++;
if ((super->qmrnas[i]->udata & 4) == 0) {
super->qmrnas[i]->udata |= 4;
}
if (matched_refs[j]==0) {
rlocus->ichainTP++;
matched_refs[j]=tmatch;
}
}
if (tmatch=='=') { //"full" or strict match
super->mrnaTP++;
qlocus->mrnaTP++;
rlocus->mrnaTP++;
if ((super->qmrnas[i]->udata & 2) ==0) {
super->qmrnas[i]->udata|=2;
}
matched_refs[j]='=';
}
} // ichain match found
} //ref loop
} //qry loop
//-- show missed references (not "matched" either '~' or '=') if requested
if (f_rmiss!=NULL) {
for (int i=0;i<super->rmrnas.Count();i++) {
if (matched_refs[i]==0)
super->rmrnas[i]->printGxf(f_rmiss, pgffAny);
}
}
for (int ql=0;ql<super->qloci.Count();ql++) {
if (super->qloci[ql]->mrnaTP>0)
super->locusQTP++;
}
for (int rl=0;rl<super->rloci.Count();rl++) {
if (super->rloci[rl]->mrnaTP >0 )
super->locusTP++;
}
}//for each unlinked locus
}
//look for qry data for a specific genomic sequence
GSeqData* getQryData(int gid, GList<GSeqData>& qdata) {
int qi=-1;
GSeqData f(gid);
GSeqData* q=NULL;
if (qdata.Found(&f,qi))
q=qdata[qi];
return q;
}
const char* findDescr(GffObj* gfobj) {
if (refdescr.Count()==0) return NULL;
GStr* s=refdescr.Find(gfobj->getID());
if (s==NULL) {
s=refdescr.Find(gfobj->getGeneName());
if (s==NULL) s=refdescr.Find(gfobj->getGeneID());
}
if (s!=NULL)
return s->chars();
return NULL;
}
const char* getGeneID(GffObj* gfobj) {
//returns anything that might resemble a gene identifier for this transcript
//or, if everything fails, returns the transcript ID
const char* s=gfobj->getGeneID();
if (s) return s;
if ((s=gfobj->getGeneName())!=NULL) return s;
if ((s=gfobj->getAttr("Name"))!=NULL) return s;
return gfobj->getID();
}
const char* getGeneNameID(GffObj& gfobj) {
//returns anything that might resemble a gene name or gene identifier for the transcript
//or, if everything fails, returns the transcript ID
const char* s=gfobj.getGeneName();
if (s) return s;
if ((s=gfobj.getGeneID())!=NULL) return s;
if ((s=gfobj.getAttr("Name"))!=NULL) return s;
return gfobj.getID();
}
const char* getGeneID(GffObj& gfobj) {
return getGeneID(&gfobj);
}
void writeLoci(FILE* f, GList<GLocus> & loci) {
for (int l=0;l<loci.Count();l++) {
GLocus& loc=*(loci[l]);
fprintf(f,"%s\t%s[%c]%d-%d\t", loc.mrna_maxcov->getID(),
loc.mrna_maxcov->getGSeqName(),
loc.mrna_maxcov->strand, loc.start,loc.end);
//now print all transcripts in this locus, comma delimited
int printfd=0;
for (int i=0;i<loc.mrnas.Count();i++) {
if (loc.mrnas[i]==loc.mrna_maxcov) continue;
if (printfd==0) fprintf(f,"%s",loc.mrnas[i]->getID());
else fprintf(f,",%s",loc.mrnas[i]->getID());
printfd++;
}
const char* rdescr=findDescr(loc.mrna_maxcov);
if (rdescr==NULL) fprintf(f,"\t\n");
else fprintf(f,"\t%s\n",rdescr);
}
}
void printXQ1(FILE* f, int qidx, GList<GLocus>& qloci) {
int printfd=0;
//print
for (int i=0;i<qloci.Count();i++) {
if (qloci[i]->qfidx!=qidx) continue;
for (int j=0;j<qloci[i]->mrnas.Count();j++) {
if (printfd==0) fprintf(f,"%s",qloci[i]->mrnas[j]->getID());
else fprintf(f,",%s",qloci[i]->mrnas[j]->getID());
printfd++;
}
}
if (printfd==0) fprintf(f,"-");
}
void numXLoci(GList<GXLocus>& xloci, int& last_id) {
for (int l=0;l<xloci.Count();l++) {
if (xloci[l]->qloci.Count()==0) continue; //we never print ref-only xloci
last_id++;
xloci[l]->id=last_id;
}
}
class GProtCl {
public:
GList<GXConsensus> protcl;
GProtCl(GXConsensus* c=NULL):protcl(true,false,false) {
if (c!=NULL)
protcl.Add(c);
}
bool add_Pcons(GXConsensus* c) {
if (c==NULL || c->aalen==0) return false;
if (protcl.Count()==0) {
protcl.Add(c);
return true;
}
if (protcl[0]->aalen!=c->aalen) return false;
if (strcmp(protcl[0]->aa,c->aa)!=0) return false;
protcl.Add(c);
return true;
}