-
Notifications
You must be signed in to change notification settings - Fork 96
/
doBlastzChainNet.pl
executable file
·1669 lines (1553 loc) · 59.2 KB
/
doBlastzChainNet.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env perl
# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/doBlastzChainNet.pl instead.
# $Id: doBlastzChainNet.pl,v 1.33 2010/04/12 16:33:12 hiram Exp $
# to-do items:
# - lots of testing
# - better logging: right now it just passes stdout and stderr,
# leaving redirection to a logfile up to the user
# - -swapBlastz, -loadBlastz
# - -tDb, -qDb
# - -tUnmasked, -qUnmasked
# - -axtBlastz
# - another Gill wish list item: save a lav header (involves run-blastz-ucsc)
# - 2bit / multi-sequence support when abridging?
# - reciprocal best?
# - hgLoadSeq of query instead of assuming there's a $qDb database?
use Getopt::Long;
use warnings;
use strict;
use FindBin qw($Bin);
use lib "$Bin";
use HgAutomate;
use HgRemoteScript;
use HgStepManager;
use File::Basename;
# Hardcoded paths/command sequences:
# Helper script that reports which fileserver hosts a given directory:
my $getFileServer = '/cluster/bin/scripts/fileServer';
# Companion scripts shipped alongside this one (found via FindBin $Bin):
my $blastzRunUcsc = "$Bin/blastz-run-ucsc";
my $partition = "$Bin/partitionSequence.pl";
# Cluster-node-local and semi-local scratch areas (see -help text):
my $clusterLocal = '/scratch/hg';
my $clusterSortaLocal = '/iscratch/i';
# NAS storage acceptable for cluster output:
my @clusterNAS = ('/cluster/bluearc', '/san/sanvol1');
# Human-readable rendering of @clusterNAS for messages:
my $clusterNAS = join('/... or ', @clusterNAS) . '/...';
# Directories and fileservers where cluster output must NOT go:
my @clusterNoNo = ('/cluster/home', '/projects');
my @fileServerNoNo = ('kkhome', 'kks00');
# Fileservers we cannot ssh into (presumably used elsewhere in the
# pipeline when choosing a host to run fileserver steps on):
my @fileServerNoLogin = ('kkusr01', '10.1.1.3', '10.1.10.11',
'sanhead1', 'sanhead2', 'sanhead3', 'sanhead4',
'sanhead5', 'sanhead6', 'sanhead7', 'sanhead8');
# Option variable names, both common and peculiar to doBlastz:
# (use vars declares the $opt_* package globals that Getopt::Long fills in;
# the common/step option name lists come from the shared automation modules.)
use vars @HgAutomate::commonOptionVars;
use vars @HgStepManager::optionVars;
use vars qw/
$opt_blastzOutRoot
$opt_swap
$opt_chainMinScore
$opt_chainLinearGap
$opt_tRepeats
$opt_qRepeats
$opt_readmeOnly
$opt_ignoreSelf
$opt_syntenicNet
$opt_noDbNameCheck
$opt_inclHap
$opt_noLoadChainSplit
$opt_loadChainSplit
/;
# Specify the steps supported with -continue / -stop:
# Each step name maps to the sub that implements it, in pipeline order.
my $stepper = new HgStepManager(
[ { name => 'partition', func => \&doPartition },
{ name => 'blastz', func => \&doBlastzClusterRun },
{ name => 'cat', func => \&doCatRun },
{ name => 'chainRun', func => \&doChainRun },
{ name => 'chainMerge', func => \&doChainMerge },
{ name => 'net', func => \&netChains },
{ name => 'load', func => \&loadUp },
{ name => 'download', func => \&doDownloads },
{ name => 'cleanup', func => \&cleanup },
{ name => 'syntenicNet',func => \&doSyntenicNet }
]
);
# Option defaults:
# Host defaults below can be overridden on the command line (see checkOptions).
# my $bigClusterHub = 'swarm';
my $bigClusterHub = 'ku';
# my $smallClusterHub = 'encodek';
my $smallClusterHub = 'ku';
my $dbHost = 'hgwdev';
my $workhorse = 'hgwdev';
# Defaults for axtChain and netClass parameters:
my $defaultChainLinearGap = "loose";
my $defaultChainMinScore = "1000"; # from axtChain itself
my $defaultTRepeats = ""; # for netClass option tRepeats
my $defaultQRepeats = ""; # for netClass option qRepeats
# Default caps on sequences per partitioned 2bit list (DEF SEQ?_LIMIT):
my $defaultSeq1Limit = 30;
my $defaultSeq2Limit = 100;
sub usage {
# Usage / help / self-documentation:
# Prints short usage to STDERR; when $detailed is true also prints the long
# list of DEF-file assumptions.  Always exits the process with $status.
my ($status, $detailed) = @_;
my $base = $0;
# Strip any leading directory components from $0 for display:
$base =~ s/^(.*\/)?//;
# Basic help (for incorrect usage):
print STDERR "
usage: $base DEF
options:
";
# Step-control options (-continue/-stop step names) from the step manager:
print STDERR $stepper->getOptionHelp();
print STDERR <<_EOF_
-blastzOutRoot dir Directory path where outputs of the blastz cluster
run will be stored. By default, they will be
stored in the $HgAutomate::clusterData build directory , but
this option can specify something more cluster-
friendly: $clusterNAS .
If dir does not already exist it will be created.
Blastz outputs are removed in the cleanup step.
-swap DEF has already been used to create chains; swap
those chains (target for query), then net etc. in
a new directory:
$HgAutomate::clusterData/\$qDb/$HgAutomate::trackBuild/blastz.\$tDb.swap/
-chainMinScore n Add -minScore=n (default: $defaultChainMinScore) to the
axtChain command.
-chainLinearGap type Add -linearGap=<loose|medium|filename> to the
axtChain command. (default: loose)
-tRepeats table Add -tRepeats=table to netClass (default: rmsk)
-qRepeats table Add -qRepeats=table to netClass (default: rmsk)
-ignoreSelf Do not assume self alignments even if tDb == qDb
-syntenicNet Perform optional syntenicNet step
-noDbNameCheck ignore Db name format
-inclHap include haplotypes *_hap* in chain/net, default not
-loadChainSplit load split chain tables, default is not split tables
_EOF_
;
# Options shared by all automation scripts, with this script's defaults:
print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
'workhorse' => $workhorse,
'fileServer' => '',
'bigClusterHub' => $bigClusterHub,
'smallClusterHub' => $smallClusterHub);
print STDERR "
Automates UCSC's blastz/chain/net pipeline:
1. Big cluster run of blastz.
2. Small cluster consolidation of blastz result files.
3. Small cluster chaining run.
4. Sorting and netting of chains on the fileserver
(no nets for self-alignments).
5. Generation of liftOver-suitable chains from nets+chains on fileserver
(not done for self-alignments).
6. Generation of axtNet and mafNet files on the fileserver (not for self).
7. Addition of gap/repeat info to nets on hgwdev (not for self).
8. Loading of chain and net tables on hgwdev (no nets for self).
9. Setup of download directory on hgwdev.
10.Optional (-syntenicNet flag): Generation of syntenic mafNet files.
DEF is a Scott Schwartz-style bash script containing blastz parameters.
This script makes a lot of assumptions about conventional placements of
certain files, and what will be in the DEF vars. Stick to the conventions
described in the -help output, pray to the cluster gods, and all will go
well. :)
";
# Detailed help (-help):
print STDERR "
Assumptions:
1. $HgAutomate::clusterData/\$db/ is the main directory for database/assembly \$db.
$HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/blastz.\$qDb.\$date/ will be the directory
created for this run, where \$tDb is the target/reference db and
\$qDb is the query. (Can be overridden, see #10 below.)
$dbHost:$HgAutomate::goldenPath/\$tDb/vs\$QDb/ (or vsSelf)
is the directory where downloadable files need to go.
LiftOver chains (not applicable for self-alignments) go in this file:
$HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/liftOver/\$tDbTo\$QDb.over.chain.gz
a copy is kept here (in case the liftOver/ copy is overwritten):
$HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/blastz.\$qDb.\$date/\$tDb.\$qDb.over.chain.gz
and symbolic links to the liftOver/ file are put here:
$dbHost:$HgAutomate::goldenPath/\$tDb/liftOver/\$tDbTo\$QDb.over.chain.gz
$dbHost:$HgAutomate::gbdb/\$tDb/liftOver/\$tDbTo\$QDb.over.chain.gz
2. DEF's SEQ1* variables describe the target/reference assembly.
DEF's SEQ2* variables describe the query assembly.
If those are the same assembly, then we're doing self-alignments and
will drop aligned blocks that cross the diagonal.
3. DEF's SEQ1_DIR is either a directory containing one nib file per
target sequence (usually chromosome), OR a complete path to a
single .2bit file containing all target sequences. This directory
should be in $clusterLocal or $clusterSortaLocal .
SEQ2_DIR: ditto for query.
4. DEF's SEQ1_LEN is a tab-separated dump of the target database table
chromInfo -- or at least a file that contains all sequence names
in the first column, and corresponding sizes in the second column.
Normally this will be $HgAutomate::clusterData/\$tDb/chrom.sizes, but for a
scaffold-based assembly, it is a good idea to put it in $clusterSortaLocal
or $clusterNAS
because it will be a large file and it is read by blastz-run-ucsc
(big cluster script).
SEQ2_LEN: ditto for query.
5. DEF's SEQ1_CHUNK and SEQ1_LAP determine the step size and overlap size
of chunks into which large target sequences are to be split before
alignment. SEQ2_CHUNK and SEQ2_LAP: ditto for query.
6. DEF's SEQ1_LIMIT and SEQ2_LIMIT decide what the maximum number of
sequences should be for any partitioned file (the files created in the
tParts and qParts directories). This limit only effects SEQ1 or SEQ2
when they are 2bit files. Some 2bit files have too many contigs. This
reduces the number of blastz hippos (jobs taking forever compared to
the other jobs). SEQ1_LIMIT defaults to $defaultSeq1Limit and SEQ2_LIMIT defaults to $defaultSeq2Limit.
7. DEF's BLASTZ_ABRIDGE_REPEATS should be set to something nonzero if
abridging of lineage-specific repeats is to be performed. If so, the
following additional constraints apply:
a. Both target and query assemblies must be structured as one nib file
per sequence in SEQ*_DIR (sorry, this rules out scaffold-based
assemblies).
b. SEQ1_SMSK must be set to a directory containing one file per target
sequence, with the name pattern \$seq.out.spec. This file must be
a RepeatMasker .out file (usually filtered by DateRepeats). The
directory should be under $clusterLocal or $clusterSortaLocal .
SEQ2_SMSK: ditto for query.
8. DEF's BLASTZ_[A-Z] variables will be translated into blastz command line
options (e.g. BLASTZ_H=foo --> H=foo, BLASTZ_Q=foo --> Q=foo).
For human-mouse evolutionary distance/sensitivity, none of these are
necessary (blastz-run-ucsc defaults will be used). Here's what we have
used for human-fugu and other very-distant pairs:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=$HgAutomate::clusterData/blastz/HoxD55.q
Blastz parameter tuning is somewhat of an art and is beyond the scope
here. Webb Miller and Jim can provide guidance on how to set these for
a new pair of organisms.
9. DEF's PATH variable, if set, must specify a path that contains programs
necessary for blastz to run: blastz, and if BLASTZ_ABRIDGE_REPEATS is set,
then also fasta-subseq, strip_rpts, restore_rpts, and revcomp.
If DEF does not contain a PATH, blastz-run-ucsc will use its own default.
10. DEF's BLASTZ variable can specify an alternate path for blastz.
11. DEF's BASE variable can specify the blastz/chain/net build directory
(defaults to $HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/blastz.\$qDb.\$date/).
12. SEQ?_CTGDIR specifies sequence source with the contents of full chrom
sequences and the contig randoms and chrUn. This keeps the contigs
separate during the blastz and chaining so that chains won't go through
across multiple contigs on the randoms.
13. SEQ?_CTGLEN specifies a length file to be used in conjunction with the
special SEQ?_CTGDIR file specified above which contains the random contigs.
14. SEQ?_LIFT specifies a lift file to lift sequences in the SEQ?_CTGDIR
to their random and chrUn positions. This is useful for a 2bit file that
has both full chrom sequences and the contigs for the randoms.
15. SEQ2_SELF=1 specifies the SEQ2 is already specially split for self
alignments and to use SEQ2 sequence for self alignment, not just a
copy of SEQ1
16. TMPDIR - specifies directory on cluster node to keep temporary files
Typically TMPDIR=/scratch/tmp
17. All other variables in DEF will be ignored!
" if ($detailed);
exit $status;
}
# Globals:
# %defVars holds the variable assignments parsed from the DEF file.
my %defVars = ();
# Set in checkDef/main: DEF file path, target/query db names ($QDb is the
# capitalized query name or 'Self'), self-alignment flags, build dir, hosts.
my ($DEF, $tDb, $qDb, $QDb, $isSelf, $selfSplit, $buildDir, $fileServer);
# Set later: swap build dir, split-chrom flag, haplotype flag, run timestamps.
my ($swapDir, $splitRef, $inclHap, $secondsStart, $secondsEnd);
sub isInDirList {
  # Return TRUE if $dir is under (begins with) something in dirList.
  # Fix: each directory name is quotemeta'd before being interpolated into
  # the alternation pattern, so paths containing regex metacharacters
  # (e.g. '.', '+') are matched literally instead of as regex syntax.
  my ($dir, @dirList) = @_;
  my $pat = '^(' . join('|', map { quotemeta($_) } @dirList) . ')(/.*)?$';
  return ($dir =~ m@$pat@);
}
sub enforceClusterNoNo {
  # Die right away if user is trying to put cluster output somewhere
  # off-limits: either under a forbidden directory (@clusterNoNo) or on a
  # forbidden fileserver (@fileServerNoNo).  $desc names the offending
  # option/setting for the error message.
  my ($dir, $desc) = @_;
  if (&isInDirList($dir, @clusterNoNo)) {
    die "\ncluster outputs are forbidden to go to " .
        join (' or ', @clusterNoNo) . " so please choose a different " .
        "$desc instead of $dir .\n\n";
  }
  # Ask the fileServer helper script which host serves $dir:
  my $testFileServer = `$getFileServer $dir/`;
  # Bug fix: backtick output ends with a newline; without chomp the
  # interpolated /^$testFileServer$/ pattern contained an embedded "\n"
  # and could never match a bare hostname in @fileServerNoNo, silently
  # disabling this check.
  chomp $testFileServer;
  if (scalar(grep /^$testFileServer$/, @fileServerNoNo)) {
    die "\ncluster outputs are forbidden to go to fileservers " .
        join (' or ', @fileServerNoNo) . " so please choose a different " .
        "$desc instead of $dir (which is hosted on $testFileServer).\n\n";
  }
}
sub checkOptions {
# Make sure command line options are valid/supported.
# Parses @ARGV into the $opt_* globals, validates step/swap/outRoot
# combinations, and applies host overrides.  Calls &usage (which exits)
# on any error.
my $ok = GetOptions(@HgStepManager::optionSpec,
@HgAutomate::commonOptionSpec,
"blastzOutRoot=s",
"swap",
"chainMinScore=i",
"chainLinearGap=s",
"tRepeats=s",
"qRepeats=s",
"readmeOnly",
"ignoreSelf",
"syntenicNet",
"noDbNameCheck",
"inclHap",
"noLoadChainSplit",
"loadChainSplit"
);
&usage(1) if (!$ok);
&usage(0, 1) if ($opt_help);
&HgAutomate::processCommonOptions();
my $err = $stepper->processOptions();
usage(1) if ($err);
# -swap reuses existing chains, so it only makes sense from chainMerge on:
if ($opt_swap) {
if ($opt_continue) {
if ($stepper->stepPrecedes($opt_continue, 'net')) {
warn "\nIf -swap is specified, then -continue must specify a step ".
"of \"net\" or later.\n";
&usage(1);
}
} else {
# If -swap is given but -continue is not, force -continue and tell
# $stepper to reevaluate options:
$opt_continue = 'chainMerge';
$err = $stepper->processOptions();
usage(1) if ($err);
}
if ($opt_stop) {
if ($stepper->stepPrecedes($opt_stop, 'chainMerge')) {
warn "\nIf -swap is specified, then -stop must specify a step ".
"of \"chainMerge\" or later.\n";
&usage(1);
}
}
}
# -blastzOutRoot must be a full path, not in a forbidden location, and
# ideally on NAS storage (warn-only if not):
if ($opt_blastzOutRoot) {
if ($opt_blastzOutRoot !~ m@^/\S+/\S+@) {
warn "\n-blastzOutRoot must specify a full path.\n";
&usage(1);
}
&enforceClusterNoNo($opt_blastzOutRoot, '-blastzOutRoot');
if (! &isInDirList($opt_blastzOutRoot, @clusterNAS)) {
warn "\n-blastzOutRoot is intended to specify something on " .
"$clusterNAS, but I'll trust your judgment " .
"and use $opt_blastzOutRoot\n\n";
}
}
# Host overrides from the command line:
$workhorse = $opt_workhorse if ($opt_workhorse);
$bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
$smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub);
}
#########################################################################
# The following routines were taken almost verbatim from blastz-run-ucsc,
# so may be good candidates for libification! unless that would slow down
# blastz-run-ucsc...
# nfsNoodge() was removed from loadDef() and loadSeqSizes() -- since this
# script will not be run on the cluster, we should fully expect files to
# be immediately visible.
sub loadDef {
  # Read parameters from a bash script with Scott's param variable names,
  # populating %defVars.  Embedded $VAR references are expanded from
  # variables defined earlier in the same file; an unresolvable reference
  # is fatal.
  my ($defFile) = @_;
  my $defFh = &HgAutomate::mustOpen("$defFile");
  while (my $line = <$defFh>) {
    # Drop a leading "export " so "export FOO=bar" parses like "FOO=bar":
    $line =~ s/^\s*export\s+//;
    # Skip comments and blank lines:
    next if ($line =~ /^\s*#/ || $line =~ /^\s*$/);
    if ($line =~ /(\w+)\s*=\s*(.*)/) {
      my ($name, $value) = ($1, $2);
      # Expand $VAR references one at a time until none remain:
      while ($value =~ /\$(\w+)/) {
        my $refName = $1;
        my $replacement = $defVars{$refName};
        if (defined $replacement) {
          $value =~ s/\$$refName/$replacement/;
        } else {
          die "Can't find value to substitute for \$$refName in $DEF var $name.\n";
        }
      }
      $defVars{$name} = $value;
    }
  }
  close($defFh);
}
sub loadSeqSizes {
  # Load up sequence -> size mapping from $sizeFile into $hashRef.
  # Expects whitespace-separated (name, size) columns, one sequence per line.
  my ($sizeFile, $hashRef) = @_;
  my $sizeFh = &HgAutomate::mustOpen("$sizeFile");
  while (my $line = <$sizeFh>) {
    chomp $line;
    my ($seqName, $seqSize) = split(' ', $line);
    $hashRef->{$seqName} = $seqSize;
  }
  close($sizeFh);
}
# end shared stuff from blastz-run-ucsc
#########################################################################
sub requireVar {
  # Abort with an error unless the named variable was defined in the DEF file.
  my ($var) = @_;
  defined $defVars{$var}
    or die "Error: $DEF is missing variable $var\n";
}
sub requirePath {
  # Require that the DEF variable $var holds a full path naming either a
  # non-empty file or a directory containing at least one regular file.
  my ($var) = @_;
  my $val = $defVars{$var};
  die "Error: $DEF $var=$val must specify a complete path\n"
    if ($val !~ m@^/\S+/\S+@);
  if ( -d $val ) {
    # Fix: single-quote the path in the shell command so whitespace or
    # shell metacharacters in $val cannot break (or inject into) find.
    my $fileCount = `find '$val' -maxdepth 1 -type f | wc -l`;
    chomp $fileCount;
    if ($fileCount < 1) {
      die "Error: $DEF variable: $var=$val specifies an empty directory.\n";
    }
  } elsif ( ! -s $val ) {
    die "Error: $DEF variable: $var=$val is not a file or directory.\n";
  }
}
sub requireNum {
  # Abort unless the named DEF variable's value is a non-negative integer.
  my ($var) = @_;
  my $val = $defVars{$var};
  unless ($val =~ /^\d+$/) {
    die "Error: $DEF variable $var=$val must specify a number.\n";
  }
}
# Database-name patterns used by getDbFromPath:
# old style: two lowercase letters + optional version digits (e.g. "hg18"):
my $oldDbFormat = '[a-z][a-z](\d+)?';
# new style: six-character mixed-case code + optional version digits
# (e.g. "galGal3" matches [a-z][a-z][a-z][A-Z][a-z][a-z0-9]\d+):
my $newDbFormat = '[a-z][a-z][a-z][A-Z][a-z][a-z0-9](\d+)?';
sub getDbFromPath {
  # Require that $val is a full path that contains a recognizable db as
  # one of its elements (possibly the last one).  With -noDbNameCheck we
  # do not die on an unrecognized name; instead we fall through to the
  # directory-layout heuristics below.
  # Fix: the original tested ($opt_noDbNameCheck || $val =~ ...) and then
  # used $1 unconditionally, so under -noDbNameCheck a failed match left
  # $db set to a stale capture from some earlier, unrelated regex match.
  # Now $db is assigned only when the match actually succeeds.
  my ($var) = @_;
  my $val = $defVars{$var};
  my $db;
  if ($val =~ m@^/\S+/($oldDbFormat|$newDbFormat)((\.2bit)|(/(\S+)?))?$@) {
    $db = $1;
  } elsif (! $opt_noDbNameCheck) {
    die "Error: $DEF variable $var=$val must be a full path with " .
        "a recognizable database as one of its elements.\n"
  }
  if (! defined($db)) {
    # Heuristics for conventional assembly-directory layouts:
    if ($val =~ m#^/hive/data/genomes/#) {
      $val =~ s#^/hive/data/genomes/##;
      $val =~ s#/.*##;
      $db = $val;
      warn "Warning: assuming database $db from /hive/data/genomes/<db>/ path\n";
    } elsif ($val =~ m#^/scratch/data/#) {
      $val =~ s#^/scratch/data/##;
      $val =~ s#/.*##;
      $db = $val;
      warn "Warning: assuming database $db from /scratch/data/<db>/ path\n";
    }
  }
  return $db;
}
sub checkDef {
# Make sure %defVars contains what we need and looks consistent with
# our assumptions.
# Side effects: sets the file globals $tDb, $qDb, $QDb, $isSelf, $selfSplit.
foreach my $s ('SEQ1_', 'SEQ2_') {
foreach my $req ('DIR', 'LEN', 'CHUNK', 'LAP') {
&requireVar("$s$req");
}
&requirePath($s . 'DIR');
&requirePath($s . 'LEN');
&requireNum($s . 'CHUNK');
&requireNum($s . 'LAP');
}
$tDb = &getDbFromPath('SEQ1_DIR');
$qDb = &getDbFromPath('SEQ2_DIR');
# Same target and query means self-alignment, unless -ignoreSelf was given:
$isSelf = $opt_ignoreSelf ? 0 : ($tDb eq $qDb);
# special split on SEQ2 for Self alignments
$selfSplit = $defVars{'SEQ2_SELF'} || 0;
$QDb = $isSelf ? 'Self' : ucfirst($qDb);
if ($isSelf && $opt_swap) {
die "-swap is not supported for self-alignments\n" .
"($DEF has $tDb as both target and query).\n";
}
HgAutomate::verbose(1, "$DEF looks OK!\n" .
"\ttDb=$tDb\n\tqDb=$qDb\n\ts1d=$defVars{SEQ1_DIR}\n" .
"\tisSelf=$isSelf\n");
# Repeat-abridging requires SMSK dirs for both sequences:
if ($defVars{'SEQ1_SMSK'} || $defVars{'SEQ2_SMSK'} ||
$defVars{'BLASTZ_ABRIDGE_REPEATS'}) {
&requireVar('BLASTZ_ABRIDGE_REPEATS');
foreach my $s ('SEQ1_', 'SEQ2_') {
my $var = $s. 'SMSK';
&requireVar($var);
&requirePath($var);
}
HgAutomate::verbose(1, "Abridging repeats!\n");
}
}
sub doPartition {
# Partition the sequence up before blastz.
# Builds and runs (on the cluster hub) a script that writes target/query
# partition lists via partitionSequence.pl, optionally bundles the parts
# into per-part 2bit files, then creates the output dir tree via xdir.sh.
my $paraHub = $bigClusterHub;
my $runDir = "$buildDir/run.blastz";
my $targetList = "$tDb.lst";
# Self-alignments reuse the target list as the query list:
my $queryList = $isSelf ? $targetList :
($opt_ignoreSelf ? "$qDb.ignoreSelf.lst" : "$qDb.lst");
if ($selfSplit) {
$queryList = "$qDb.selfSplit.lst"
}
my $tPartDir = '-lstDir tParts';
my $qPartDir = '-lstDir qParts';
my $outRoot = $opt_blastzOutRoot ? "$opt_blastzOutRoot/psl" : '../psl';
# CTGDIR/CTGLEN (contig-level sources), when present, override DIR/LEN:
my $seq1Dir = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'};
my $seq2Dir = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'};
my $seq1Len = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'};
my $seq2Len = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'};
my $seq1Limit = (defined $defVars{'SEQ1_LIMIT'}) ? $defVars{'SEQ1_LIMIT'} :
$defaultSeq1Limit;
my $seq2Limit = (defined $defVars{'SEQ2_LIMIT'}) ? $defVars{'SEQ2_LIMIT'} :
$defaultSeq2Limit;
# Largest query sequence size, from column 2 of the length file:
my $seq2MaxLength = `awk '{print \$2}' $seq2Len | sort -rn | head -1`;
chomp $seq2MaxLength;
my $bundleParts = 0;
# OK to bundle parts list bits into 2bit files when not abridging
$bundleParts = 1 if ( ! $defVars{'BLASTZ_ABRIDGE_REPEATS'} );
my $partitionTargetCmd =
("$partition $defVars{SEQ1_CHUNK} $defVars{SEQ1_LAP} " .
"$seq1Dir $seq1Len -xdir xdir.sh -rawDir $outRoot $seq1Limit " .
"$tPartDir > $targetList");
my $partitionQueryCmd =
(($isSelf && (! $selfSplit)) ?
'# Self-alignment ==> use target partition for both.' :
"$partition $defVars{SEQ2_CHUNK} $defVars{SEQ2_LAP} " .
"$seq2Dir $seq2Len $seq2Limit " .
"$qPartDir > $queryList");
&HgAutomate::mustMkdir($runDir);
my $whatItDoes =
"It computes partitions of target and query sequences into chunks of the
specified size for the blastz cluster run. The actual splitting of
sequence is not performed here, but later on by blastz cluster jobs.";
my $bossScript = newBash HgRemoteScript("$runDir/doPartition.bash", $paraHub,
$runDir, $whatItDoes, $DEF);
# Run both partition commands and report the resulting job-list size
# (L1 target parts x L2 query parts):
$bossScript->add(<<_EOF_
$partitionTargetCmd
export L1=`wc -l < $targetList`
$partitionQueryCmd
export L2=`wc -l < $queryList`
export L=`echo \$L1 \$L2 | awk '{print \$1*\$2}'`
echo "cluster batch jobList size: \$L = \$L1 * \$L2"
_EOF_
);
# When not abridging repeats, pre-extract each part list into its own
# 2bit file so cluster jobs don't have to hit the full sequence source:
if ($bundleParts) {
$bossScript->add(<<_EOF_
if [ -d tParts ]; then
echo 'constructing tParts/*.2bit files'
ls tParts/*.lst | sed -e 's#tParts/##; s#.lst##;' | while read tPart
do
sed -e 's#.*.2bit:##;' tParts/\$tPart.lst \\
| twoBitToFa -seqList=stdin $seq1Dir stdout \\
| faToTwoBit stdin tParts/\$tPart.2bit
done
fi
if [ -d qParts ]; then
echo 'constructing qParts/*.2bit files'
ls qParts/*.lst | sed -e 's#qParts/##; s#.lst##;' | while read qPart
do
sed -e 's#.*.2bit:##;' qParts/\$qPart.lst \\
| twoBitToFa -seqList=stdin $seq2Dir stdout \\
| faToTwoBit stdin qParts/\$qPart.2bit
done
fi
_EOF_
);
}
$bossScript->execute();
# Create the psl output directory tree (xdir.sh was written by the
# target partition command above); run on the hub if -blastzOutRoot
# was given, otherwise on the fileserver:
my $mkOutRootHost = $opt_blastzOutRoot ? $paraHub : $fileServer;
my $mkOutRoot = $opt_blastzOutRoot ? "mkdir -p $opt_blastzOutRoot;" : "";
&HgAutomate::run("$HgAutomate::runSSH $mkOutRootHost " .
"'(cd $runDir; $mkOutRoot csh -ef xdir.sh)'");
}
sub doBlastzClusterRun {
  # Set up and perform the big-cluster blastz run.
  # Uses globals: $bigClusterHub, $buildDir, $tDb, $qDb, $isSelf,
  # $opt_ignoreSelf, $selfSplit, $opt_blastzOutRoot, $opt_debug,
  # $blastzRunUcsc, $DEF.
  # Dies if this step already ran, or if the partitioning step's list files
  # are missing.
  my $paraHub = $bigClusterHub;
  my $runDir = "$buildDir/run.blastz";
  my $targetList = "$tDb.lst";
  my $outRoot = $opt_blastzOutRoot ? "$opt_blastzOutRoot/psl" : '../psl';
  # Self-alignments reuse the target partition list as the query list.
  my $queryList = $isSelf ? $targetList :
      ($opt_ignoreSelf ? "$qDb.ignoreSelf.lst" : "$qDb.lst");
  if ($selfSplit) {
    $queryList = "$qDb.selfSplit.lst"
  }
  # First, make sure we're starting clean.
  if (-e "$runDir/run.time") {
    die "doBlastzClusterRun: looks like this was run successfully already " .
        "(run.time exists). Either run with -continue cat or some later " .
        "stage, or move aside/remove $runDir/ and run again.\n";
  } elsif ((-e "$runDir/gsub" || -e "$runDir/jobList") && ! $opt_debug) {
    die "doBlastzClusterRun: looks like we are not starting with a clean " .
        "slate. Please move aside or remove $runDir/ and run again.\n";
  }
  # Second, make sure we got through the partitioning already
  if (! -e "$runDir/$targetList" && ! $opt_debug) {
    die "doBlastzClusterRun: there's no target list file " .
        "so start over without the -continue align.\n";
  }
  if (! -e "$runDir/$queryList" && ! $opt_debug) {
    # Bug fix: the message previously read "...query list fileso start over"
    # because the space between the concatenated pieces was missing.
    die "doBlastzClusterRun: there's no query list file " .
        "so start over without the -continue align.\n";
  }
  # One cluster job per (target chunk, query chunk) pair; gensub2's
  # "check out exists" clause requires each job's psl output to appear.
  my $templateCmd = ("$blastzRunUcsc -outFormat psl " .
                     ($isSelf ? '-dropSelf ' : '') .
                     '$(path1) $(path2) ../DEF ' .
                     '{check out exists ' .
                     $outRoot . '/$(file1)/$(file1)_$(file2).psl }');
  &HgAutomate::makeGsub($runDir, $templateCmd);
  `touch "$runDir/para_hub_$paraHub"`;
  my $whatItDoes = "It sets up and performs the big cluster blastz run.";
  my $bossScript = new HgRemoteScript("$runDir/doClusterRun.csh", $paraHub,
                                      $runDir, $whatItDoes, $DEF);
  $bossScript->add(<<_EOF_
$HgAutomate::gensub2 $targetList $queryList gsub jobList
$HgAutomate::paraRun
_EOF_
  );
  $bossScript->execute();
} # sub doBlastzClusterRun {}
sub doCatRun {
  # Do a small cluster run to concatenate the lowest level of chunk result
  # files from the big cluster blastz run. This brings results up to the
  # next level: per-target-chunk results, which may still need to be
  # concatenated into per-target-sequence in the next step after this one --
  # chaining.
  # Uses globals: $smallClusterHub, $buildDir, $opt_debug, $opt_blastzOutRoot,
  # $DEF. Side effects: creates $buildDir/run.cat/ (cat.csh, gsub, jobList)
  # and runs the batch there; outputs land in $buildDir/pslParts/.
  my $paraHub = $smallClusterHub;
  my $runDir = "$buildDir/run.cat";
  # First, make sure we're starting clean.
  if (-e "$runDir/run.time") {
    die "doCatRun: looks like this was run successfully already " .
        "(run.time exists). Either run with -continue chainRun or some later " .
        "stage, or move aside/remove $runDir/ and run again.\n";
  } elsif ((-e "$runDir/gsub" || -e "$runDir/jobList") && ! $opt_debug) {
    die "doCatRun: looks like we are not starting with a clean " .
        "slate. Please move aside or remove $runDir/ and run again.\n";
  }
  # Make sure previous stage was successful.
  my $successFile = "$buildDir/run.blastz/run.time";
  if (! -e $successFile && ! $opt_debug) {
    die "doCatRun: looks like previous stage was not successful (can't find " .
        "$successFile).\n";
  }
  &HgAutomate::mustMkdir($runDir);
  # One job per target-chunk subdirectory of $outRoot; "check out exists"
  # makes para verify the gzipped per-chunk psl output was produced.
  &HgAutomate::makeGsub($runDir,
      "./cat.csh \$(path1) {check out exists ../pslParts/\$(file1).psl.gz}");
  `touch "$runDir/para_hub_$paraHub"`;
  my $outRoot = $opt_blastzOutRoot ? "$opt_blastzOutRoot/psl" : '../psl';
  # cat.csh ($1 = chunk subdir, $2 = output): concatenate and compress all
  # raw psl files for one target chunk.
  my $fh = &HgAutomate::mustOpen(">$runDir/cat.csh");
  print $fh <<_EOF_
#!/bin/csh -ef
find $outRoot/\$1/ -name "*.psl" | xargs cat | gzip -c > \$2
_EOF_
  ;
  close($fh);
  my $whatItDoes =
"It sets up and performs a small cluster run to concatenate all files in
each subdirectory of $outRoot into a per-target-chunk file.";
  my $bossScript = new HgRemoteScript("$runDir/doCatRun.csh", $paraHub,
                                      $runDir, $whatItDoes, $DEF);
  # The boss script enumerates the chunk subdirectories into tParts.lst,
  # builds the job list with gensub2, and runs the para batch.
  $bossScript->add(<<_EOF_
(cd $outRoot; find . -maxdepth 1 -type d | grep '^./') \\
| sed -e 's#/\$##; s#^./##' > tParts.lst
chmod a+x cat.csh
$HgAutomate::gensub2 tParts.lst single gsub jobList
mkdir ../pslParts
$HgAutomate::paraRun
_EOF_
  );
  $bossScript->execute();
} # sub doCatRun {}
sub makePslPartsLst {
  # Create a pslParts.lst file from the contents of pslParts/; if some
  # entries are for subsequences of the same sequence, make a single .lst
  # line for the sequence (single chaining job with subseqs' alignments
  # catted together). Otherwise (i.e. subdirs that contain small
  # target seqs glommed together by partitionSequences) make one .lst
  # line per partition.
  # Uses globals: $opt_debug, $buildDir.
  # Dies if pslParts/ is unreadable or yields no entries.
  return if ($opt_debug);
  # Use a lexical dirhandle rather than the package-global bareword P.
  opendir(my $dh, "$buildDir/pslParts")
    || die "Couldn't open directory $buildDir/pslParts for reading: $!\n";
  # Sort for deterministic list order across runs; readdir order is
  # filesystem-dependent.
  my @parts = sort readdir($dh);
  closedir($dh);
  my $partsLst = "$buildDir/axtChain/run/pslParts.lst";
  my $fh = &HgAutomate::mustOpen(">$partsLst");
  my %seqs = ();
  my $count = 0;
  foreach my $p (@parts) {
    # Strip any leading path, trailing slash, and the .psl.gz suffix so $p
    # is a bare partition name.
    $p =~ s@^/.*/@@;  $p =~ s@/$@@;
    $p =~ s/\.psl\.gz//;
    next if ($p eq '.' || $p eq '..');
    if ($p =~ m@^(\S+:\S+):\d+-\d+$@) {
      # Collapse subsequences (subranges of a sequence) down to one entry
      # per sequence:
      $seqs{$1} = 1;
    } else {
      print $fh "$p\n";
      $count++;
    }
  }
  # Sorted for determinism (hash key order is arbitrary). The trailing ':'
  # lets the chaining job's "pslParts/$1*.psl.gz" glob pick up all
  # subranges of the sequence.
  foreach my $p (sort keys %seqs) {
    print $fh "$p:\n";
    $count++;
  }
  close($fh);
  if ($count < 1) {
    die "makePslPartsLst: didn't find any pslParts/ items.";
  }
}
sub doChainRun {
  # Do a small cluster run to chain alignments to each target sequence.
  # Uses globals: $smallClusterHub, $buildDir, $opt_debug, %defVars,
  # $opt_chainMinScore, $opt_chainLinearGap, $defaultChainLinearGap, $DEF.
  # Side effects: creates $buildDir/axtChain/run/ (chain.csh, gsub,
  # pslParts.lst, jobList) and runs the cluster batch there.
  my $paraHub = $smallClusterHub;
  my $runDir = "$buildDir/axtChain/run";
  # First, make sure we're starting clean.
  if (-e "$runDir/run.time") {
    die "doChainRun: looks like this was run successfully already " .
        "(run.time exists). Either run with -continue chainMerge or some " .
        "later stage, or move aside/remove $runDir/ and run again.\n";
  } elsif ((-e "$runDir/gsub" || -e "$runDir/jobList") && ! $opt_debug) {
    die "doChainRun: looks like we are not starting with a clean " .
        "slate. Please move aside or remove $runDir/ and run again.\n";
  }
  # Make sure previous stage was successful.
  my $successFile = "$buildDir/run.cat/run.time";
  if (! -e $successFile && ! $opt_debug) {
    die "doChainRun: looks like previous stage was not successful (can't " .
        "find $successFile).\n";
  }
  &HgAutomate::mustMkdir($runDir);
  # One job per pslParts.lst line; "check out line+" requires each job's
  # chain file to exist and be non-empty.
  &HgAutomate::makeGsub($runDir,
      "chain.csh \$(file1) {check out line+ chain/\$(file1).chain}");
  `touch "$runDir/para_hub_$paraHub"`;
  # Prefer assembly-time contig dirs (SEQ[12]_CTGDIR) when defined;
  # otherwise the standard sequence dirs.
  my $seq1Dir = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'};
  my $seq2Dir = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'};
  my $matrix = $defVars{'BLASTZ_Q'} ? "-scoreScheme=$defVars{BLASTZ_Q} " : "";
  my $minScore = $opt_chainMinScore ? "-minScore=$opt_chainMinScore" : "";
  my $linearGap = $opt_chainLinearGap ? "-linearGap=$opt_chainLinearGap" :
      "-linearGap=$defaultChainLinearGap";
  # chain.csh ($1 = partition name, $2 = output chain): chain all psl for
  # one target seq/partition, then drop repeat pile-ups with chainAntiRepeat.
  my $fh = &HgAutomate::mustOpen(">$runDir/chain.csh");
  print $fh <<_EOF_
#!/bin/csh -ef
zcat ../../pslParts/\$1*.psl.gz \\
| axtChain -psl -verbose=0 $matrix $minScore $linearGap stdin \\
$seq1Dir \\
$seq2Dir \\
stdout \\
| chainAntiRepeat $seq1Dir \\
$seq2Dir \\
stdin \$2
_EOF_
  ;
  # If the target was aligned as contigs, append lift-to-browser-coords
  # commands to chain.csh.
  if (exists($defVars{'SEQ1_LIFT'})) {
    print $fh <<_EOF_
set c=\$2:t:r
echo "lifting \$2 to \${c}.lifted.chain"
liftUp liftedChain/\${c}.lifted.chain \\
$defVars{'SEQ1_LIFT'} carry \$2
rm \$2
mv liftedChain/\${c}.lifted.chain \$2
_EOF_
  ;
  }
  # Likewise for the query side (-chainQ lifts query coordinates).
  if (exists($defVars{'SEQ2_LIFT'})) {
    print $fh <<_EOF_
set c=\$2:t:r
echo "lifting \$2 to \${c}.lifted.chain"
liftUp -chainQ liftedChain/\${c}.lifted.chain \\
$defVars{'SEQ2_LIFT'} carry \$2
rm \$2
mv liftedChain/\${c}.lifted.chain \$2
_EOF_
  ;
  }
  close($fh);
  &makePslPartsLst();
  my $whatItDoes =
"It sets up and performs a small cluster run to chain all alignments
to each target sequence.";
  my $bossScript = new HgRemoteScript("$runDir/doChainRun.csh", $paraHub,
                                      $runDir, $whatItDoes, $DEF);
  # liftedChain is scratch space for the optional lift steps; it should be
  # empty (and removable) once the batch finishes.
  $bossScript->add(<<_EOF_
chmod a+x chain.csh
$HgAutomate::gensub2 pslParts.lst single gsub jobList
mkdir chain liftedChain
$HgAutomate::paraRun
rmdir liftedChain
_EOF_
  );
  $bossScript->execute();
} # sub doChainRun {}
sub postProcessChains {
  # chainMergeSort etc.
  # Merge the per-partition chain files from the chainRun step into a
  # single sorted, compressed genome-wide chain file (and per-chrom files
  # when the reference is split).
  my $runDir = "$buildDir/axtChain";
  my $allChain = "$tDb.$qDb.all.chain.gz";
  # Refuse to clobber output from a previous successful run.
  if (-e "$runDir/$allChain") {
    die "postProcessChains: looks like this was run successfully already " .
        "($allChain exists). Either run with -continue net or some later " .
        "stage, or move aside/remove $runDir/$allChain and run again.\n";
  }
  if (-e "$runDir/all.chain" || -e "$runDir/all.chain.gz") {
    die "postProcessChains: looks like this was run successfully already " .
        "(all.chain[.gz] exists). Either run with -continue net or some later " .
        "stage, or move aside/remove $runDir/all.chain[.gz] and run again.\n";
  }
  if (-e "$runDir/chain" && ! $opt_debug) {
    die "postProcessChains: looks like we are not starting with a clean " .
        "slate. Please move aside or remove $runDir/chain and run again.\n";
  }
  # Make sure previous stage was successful.
  my $successFile = "$buildDir/axtChain/run/run.time";
  if (! -e $successFile && ! $opt_debug) {
    die "postProcessChains: looks like previous stage was not successful " .
        "(can't find $successFile).\n";
  }
  # Merge-sort all per-chunk chains into one compressed file on the
  # workhorse machine.
  &HgAutomate::run("$HgAutomate::runSSH $workhorse nice " .
      "'find $runDir/run/chain -name \"*.chain\" " .
      "| chainMergeSort -inputList=stdin " .
      "| nice gzip -c > $runDir/$allChain'");
  # For split references, also split into one chain file per chromosome.
  if ($splitRef) {
    &HgAutomate::run("$HgAutomate::runSSH $fileServer nice " .
        "chainSplit $runDir/chain $runDir/$allChain");
  }
  &HgAutomate::nfsNoodge("$runDir/$allChain");
} # sub postProcessChains {}
sub getAllChain {
  # Find the most likely candidate for all.chain from a previous run/step.
  # Returns the bare filename (not the full path) of the first existing
  # candidate in $runDir, preferring db-qualified and gzipped variants;
  # returns undef when none exists (unless -debug, which assumes the
  # preferred name).
  my ($runDir) = @_;
  foreach my $candidate ("$tDb.$qDb.all.chain.gz", "$tDb.$qDb.all.chain",
                         "all.chain.gz", "all.chain") {
    return $candidate if (-e "$runDir/$candidate");
  }
  return "$tDb.$qDb.all.chain.gz" if ($opt_debug);
  return undef;
}
sub swapChains {
  # chainMerge step for -swap: chainSwap | chainSort.
  # Takes the all.chain made by the original (unswapped) run and writes
  # query-referenced, re-sorted chains under $swapDir/axtChain.
  my $swapRunDir = "$swapDir/axtChain";
  my $srcChain = &getAllChain("$buildDir/axtChain");
  my $outChain = "$qDb.$tDb.all.chain.gz";
  # Refuse to clobber output from a previous successful run.
  if (-e "$swapRunDir/$outChain") {
    die "swapChains: looks like this was run successfully already " .
        "($swapRunDir/$outChain exists). Either run with -continue net or some " .
        "later stage, or move aside/remove $swapRunDir/$outChain and run again.\n";
  }
  if (-e "$swapRunDir/all.chain" || -e "$swapRunDir/all.chain.gz") {
    die "swapChains: looks like this was run successfully already " .
        "($swapRunDir/all.chain[.gz] exists). Either run with -continue net or some " .
        "later stage, or move aside/remove $swapRunDir/all.chain[.gz] and run again.\n";
  }
  # Main routine already made sure that $buildDir/axtChain/all.chain is there.
  # Swap target/query coordinates, then re-sort by the new target coords.
  &HgAutomate::run("$HgAutomate::runSSH $workhorse nice " .
      "'chainSwap $buildDir/axtChain/$srcChain stdout " .
      "| nice chainSort stdin stdout " .
      "| nice gzip -c > $swapRunDir/$outChain'");
  &HgAutomate::nfsNoodge("$swapRunDir/$outChain");
  # For split references, also split into one chain file per chromosome.
  if ($splitRef) {
    &HgAutomate::run("$HgAutomate::runSSH $fileServer nice " .
        "chainSplit $swapRunDir/chain $swapRunDir/$outChain");
  }
} # sub swapChains {}
sub swapGlobals {
  # Swap our global variables ($buildDir, $tDb, $qDb and %defVars SEQ1/SEQ2)
  # so that the remaining steps need no tweaks for -swap.
  $buildDir = $swapDir;
  # Exchange target and query database names in one list assignment.
  ($tDb, $qDb) = ($qDb, $tDb);
  # $qDb now holds the original target, so $QDb follows the new query.
  $QDb = $isSelf ? 'Self' : ucfirst($qDb);
  # Exchange each SEQ1_*/SEQ2_* pair via a hash-slice swap.
  foreach my $suffix ('DIR', 'LEN', 'CHUNK', 'LAP', 'SMSK') {
    @defVars{"SEQ1_$suffix", "SEQ2_$suffix"} =
        @defVars{"SEQ2_$suffix", "SEQ1_$suffix"};
  }
  $defVars{'BASE'} = $swapDir;
}
sub doChainMerge {
  # If -swap, swap chains from other org; otherwise, merge the results
  # from the chainRun step.
  unless ($opt_swap) {
    &postProcessChains();
    return;
  }
  # -swap: build the swapped chains, then point globals at the swapped build.
  &swapChains();
  &swapGlobals();
}
sub netChains {
  # Turn chains into nets (,axt,maf,.over.chain).
  # Don't do this for self alignments.
  # Uses globals: $isSelf, $buildDir, $workhorse, $splitRef, $tDb, $qDb,
  # $inclHap, %defVars, $dbHost, $opt_debug, $DEF.
  # Side effects: writes noClass.net, $tDb.$qDb.over.chain.gz and (per-chrom
  # or whole-genome) axtNet/ and mafNet/ under $buildDir.
  return if ($isSelf);
  my $runDir = "$buildDir/axtChain";
  # First, make sure we're starting clean.
  if (-d "$buildDir/mafNet") {
    die "netChains: looks like this was run successfully already " .
        "(mafNet exists). Either run with -continue load or some later " .
        "stage, or move aside/remove $buildDir/mafNet " .
        "and $runDir/noClass.net and run again.\n";
  } elsif (-e "$runDir/noClass.net") {
    die "netChains: looks like we are not starting with a " .
        "clean slate. Please move aside or remove $runDir/noClass.net " .
        "and run again.\n";
  }
  # Make sure previous stage was successful.
  my $chain = &getAllChain($runDir);
  if (! defined $chain && ! $opt_debug) {
    die "netChains: looks like previous stage was not successful " .
        "(can't find [$tDb.$qDb.]all.chain[.gz]).\n";
  }
  my $whatItDoes =
"It generates nets (without repeat/gap stats -- those are added later on
$dbHost) from chains, and generates axt, maf and .over.chain from the nets.";
  my $bossScript = new HgRemoteScript("$runDir/netChains.csh", $workhorse,
                                      $runDir, $whatItDoes, $DEF);
  # Net the chains and derive liftOver chains from the net.
  $bossScript->add(<<_EOF_
# Make nets ("noClass", i.e. without rmsk/class stats which are added later):
chainPreNet $inclHap $chain $defVars{SEQ1_LEN} $defVars{SEQ2_LEN} stdout \\
| chainNet $inclHap stdin -minSpace=1 $defVars{SEQ1_LEN} $defVars{SEQ2_LEN} stdout /dev/null \\
| netSyntenic stdin noClass.net
# Make liftOver chains:
netChainSubset -verbose=0 noClass.net $chain stdout \\
| chainStitchId stdin stdout | gzip -c > $tDb.$qDb.over.chain.gz
_EOF_
  );
  my $seq1Dir = $defVars{'SEQ1_DIR'};
  my $seq2Dir = $defVars{'SEQ2_DIR'};
  # Split reference: one axt/maf per target sequence; otherwise one file
  # covering the whole target genome.
  if ($splitRef) {
    $bossScript->add(<<_EOF_
# Make axtNet for download: one .axt per $tDb seq.
netSplit noClass.net net
cd ..
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt \$f axtChain/chain/\$f:t:r.chain \\
$seq1Dir $seq2Dir stdout \\
| axtSort stdin stdout \\
| gzip -c > axtNet/\$f:t:r.$tDb.$qDb.net.axt.gz
end
# Make mafNet for multiz: one .maf per $tDb seq.
mkdir mafNet
foreach f (axtNet/*.$tDb.$qDb.net.axt.gz)
axtToMaf -tPrefix=$tDb. -qPrefix=$qDb. \$f \\
$defVars{SEQ1_LEN} $defVars{SEQ2_LEN} \\
stdout \\
| gzip -c > mafNet/\$f:t:r:r:r:r:r.maf.gz
end
_EOF_
    );
  } else {
    $bossScript->add(<<_EOF_
# Make axtNet for download: one .axt for all of $tDb.
mkdir ../axtNet
netToAxt -verbose=0 noClass.net $chain \\
$seq1Dir $seq2Dir stdout \\
| axtSort stdin stdout \\
| gzip -c > ../axtNet/$tDb.$qDb.net.axt.gz
# Make mafNet for multiz: one .maf for all of $tDb.
mkdir ../mafNet
axtToMaf -tPrefix=$tDb. -qPrefix=$qDb. ../axtNet/$tDb.$qDb.net.axt.gz \\
$defVars{SEQ1_LEN} $defVars{SEQ2_LEN} \\
stdout \\
| gzip -c > ../mafNet/$tDb.$qDb.net.maf.gz
_EOF_
    );
  }
  $bossScript->execute();
} # sub netChains {}
sub loadUp {
# Load chains; add repeat/gap stats to net; load nets.
my $runDir = "$buildDir/axtChain";
my $QDbLink = "chain$QDb" . "Link";
# First, make sure we're starting clean.