diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 00e2f69..0000000 --- a/.gitmodules +++ /dev/null @@ -1,4 +0,0 @@ -[submodule "STAR"] - path = star-sys/STAR - url = https://github.com/10XGenomics/STAR.git - branch = orbit diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f477ec9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 10x Genomics + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/star-sys/STAR b/star-sys/STAR deleted file mode 160000 index 87af899..0000000 --- a/star-sys/STAR +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 87af89958ff11d84c5d7cd56285cc4cb9f9384b6 diff --git a/star-sys/STAR/LICENSE b/star-sys/STAR/LICENSE new file mode 100644 index 0000000..7f2dabd --- /dev/null +++ b/star-sys/STAR/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Alexander Dobin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/star-sys/STAR/source/1.fastq b/star-sys/STAR/source/1.fastq new file mode 100644 index 0000000..b465e0f --- /dev/null +++ b/star-sys/STAR/source/1.fastq @@ -0,0 +1,2220 @@ +@D000684:779:H53GNBCXY:1:1101:1303:2361 3:N:0:0 +GTGCGGGGAGAAGTTTCAAGAAGGTTCTTATGGAAAAAAGGCTGTGAGCATAGAAAGCAGTCATAGGAGGTTGGGGAACTAGCTTGTCCCTCCCCACC ++ +GGGAGIGIIIGIIGGGGIIGGIGGAGGAGGAAG.GGIIIG0) { + bamInStream.seekg(std::ios::beg); + bamInStream.read(bamIn+bamInBytes,s1);//read the whole file + } else if (s1<0) { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: failed reading from temporary file: " << dirBAMsort+to_string(it)+"/"+to_string((uint) iBin); + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, 1, P); + }; + bamInBytes += bamInStream.gcount(); + bamInStream.close(); + remove(bamInFile.c_str()); + }; + if (bamInBytes!=binS) { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: number of bytes expected from the BAM bin does not agree with the actual size on disk: "; + errOut << "Expected bin size=" <logMain, 1, P); + }; + + //extract coordinates + + for (uint ib=0,ia=0;ia); + + BGZF *bgzfBin; + bgzfBin=bgzf_open((dirBAMsort+"/b"+to_string((uint) iBin)).c_str(),("w"+to_string((long long) P.outBAMcompression)).c_str()); + if (bgzfBin==NULL) { + ostringstream errOut; + errOut <<"EXITING because of fatal ERROR: could not open temporary bam file: " << dirBAMsort+"/b"+to_string((uint) iBin) << "\n"; + errOut <<"SOLUTION: check that the disk is not full, increase the max number of open files with Linux command ulimit -n before running STAR"; + exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_PARAMETER, P); + }; + + outBAMwriteHeader(bgzfBin,P.samHeaderSortedCoord,mapGen.chrNameAll,mapGen.chrLengthAll); + //send ordered aligns to bgzf one-by-one + for (uint ia=0;ialogMain, EXIT_CODE_PARAMETER, P); + }; + + outBAMwriteHeader(bgzfBin,P.samHeaderSortedCoord,mapGen.chrNameAll,mapGen.chrLengthAll); + + + vector bamInFile; + 
std::map startPos; + + for (uint it=0; it bamSize(bamInFile.size(),0);//record sizes + + //allocate arrays + char **bamIn=new char* [bamInFile.size()]; + ifstream *bamInStream = new ifstream [bamInFile.size()]; + + for (uint it=0; it0) { + uint it=startPos.begin()->second; + uint startNext=startPos.size()>1 ? (++startPos.begin())->first : (uint) -1; + + while (true) { + bgzf_write(bgzfBin, bamIn[it], bamSize.at(it)); + bamInStream[it].read(bamIn[it],sizeof(int32));//read record size + if (bamInStream[it].good()) { + bamSize[it]=((*(uint32*)bamIn[it])+sizeof(int32)); + bamInStream[it].read(bamIn[it]+sizeof(int32),bamSize.at(it)-sizeof(int32)+sizeof(uint));//read the rest of the record, including la$ + uint iRead=*(uint*)(bamIn[it]+bamSize.at(it)); + if (iRead>startNext) {//this read from this chunk is > than a read from another chunk + startPos[iRead]=it; + break; + }; + } else {//nothing to do here, reached the end of the file + break; + }; + }; + startPos.erase(startPos.begin()); + }; + + bgzf_flush(bgzfBin); + bgzf_close(bgzfBin); + + + for (uint it=0; it +#include "BAMfunctions.h" +#include "htslib/htslib/kstring.h" + +string bam_cigarString (bam1_t *b) {//output CIGAR string +// kstring_t strK; +// kstring_t *str=&strK; + const bam1_core_t *c = &b->core; + + string cigarString(""); + if ( c->n_cigar > 0 ) { + uint32_t *cigar = bam_get_cigar(b); + for (int i = 0; i < c->n_cigar; ++i) { + cigarString+=to_string((uint)bam_cigar_oplen(cigar[i]))+bam_cigar_opchr(cigar[i]); + }; + }; + + +// if (c->n_cigar) { // cigar +// for (int i = 0; i < c->n_cigar; ++i) { +// kputw(bam_cigar_oplen(cigar[i]), str); +// kputc(bam_cigar_opchr(cigar[i]), str); +// } +// } else kputc('*', str); +// +// string cigarString (str->s,str->l); + return cigarString; +}; + +int bam_read1_fromArray(char *bamChar, bam1_t *b) //modified from samtools bam_read1 to assign BAM record in mmemry to bam structure +{ + bam1_core_t *c = &b->core; + int32_t block_len; //, ret, i; +// // uint32_t x[8]; 
+// // if ((ret = bgzf_read(fp, &block_len, 4)) != 4) { +// // if (ret == 0) return -1; // normal end-of-file +// // else return -2; // truncated +// // } + uint32_t *x; + + uint32_t *bamU32=(uint32_t*) bamChar; + block_len=bamU32[0]; + +// // if (bgzf_read(fp, x, 32) != 32) return -3; +// // if (fp->is_be) { +// // ed_swap_4p(&block_len); +// // for (i = 0; i < 8; ++i) ed_swap_4p(x + i); +// // } + x=bamU32+1; + + c->tid = x[0]; c->pos = x[1]; + c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; + c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; + c->l_qseq = x[4]; + c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + b->l_data = block_len - 32; + if (b->l_data < 0 || c->l_qseq < 0) return -4; + if ((char *)bam_get_aux(b) - (char *)b->data > b->l_data) + return -4; + if (b->m_data < b->l_data) { + b->m_data = b->l_data; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + if (!b->data) + return -4; + } +// // if (bgzf_read(fp, b->data, b->l_data) != b->l_data) return -4; +// // //b->l_aux = b->l_data - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; +// // if (fp->is_be) swap_data(c, b->l_data, b->data, 0); + b->data=(uint8_t*) bamChar+4*9; + + return 4 + block_len; +} + + +void outBAMwriteHeader (BGZF* fp, const string &samh, const vector &chrn, const vector &chrl) { + throw std::runtime_error("Unimplemented!"); + //bgzf_write(fp,"BAM\001",4); + int32 hlen=samh.size(); + //bgzf_write(fp,(char*) &hlen,sizeof(hlen)); + //bgzf_write(fp,samh.c_str(),hlen); + int32 nchr=(int32) chrn.size(); + //bgzf_write(fp,(char*) &nchr,sizeof(nchr)); + for (int32 ii=0;ii +TintType bamAttributeInt(const char *bamAux, const char *attrName) {//not tested!!! 
+ const char *attrStart=strstr(bamAux,attrName); + if (attrStart==NULL) return (TintType) -1; + switch (attrStart[2]) { + case ('c'): + return (TintType) *(int8_t*)(attrStart+3); + case ('s'): + return (TintType) *(int16_t*)(attrStart+3); + case ('i'): + return (TintType) *(int32_t*)(attrStart+3); + case ('C'): + return (TintType) *(uint8_t*)(attrStart+3); + case ('S'): + return (TintType) *(uint16_t*)(attrStart+3); + case ('I'): + return (TintType) *(uint32_t*)(attrStart+3); + }; +}; diff --git a/star-sys/STAR/source/BAMfunctions.h b/star-sys/STAR/source/BAMfunctions.h new file mode 100644 index 0000000..51f3dcc --- /dev/null +++ b/star-sys/STAR/source/BAMfunctions.h @@ -0,0 +1,10 @@ +#ifndef DEF_BAMfunctions +#define DEF_BAMfunctions + +#include "IncludeDefine.h" +#include SAMTOOLS_BGZF_H +#include SAMTOOLS_SAM_H + void outBAMwriteHeader (BGZF* fp, const string &samh, const vector &chrn, const vector &chrl); + int bam_read1_fromArray(char *bamChar, bam1_t *b); + string bam_cigarString (bam1_t *b); +#endif \ No newline at end of file diff --git a/star-sys/STAR/source/BAMoutput.cpp b/star-sys/STAR/source/BAMoutput.cpp new file mode 100644 index 0000000..356c4d5 --- /dev/null +++ b/star-sys/STAR/source/BAMoutput.cpp @@ -0,0 +1,187 @@ +#include +#include "BAMoutput.h" +#include +#include "GlobalVariables.h" +#include +#include "serviceFuns.cpp" +#include "ThreadControl.h" +#include "streamFuns.h" + +BAMoutput::BAMoutput (int iChunk, string tmpDir, Parameters &Pin) : P(Pin){//allocate bam array + + nBins=P.outBAMcoordNbins; + binSize=P.chunkOutBAMsizeBytes/nBins; + bamArraySize=binSize*nBins; + bamArray = new char [bamArraySize]; + + bamDir=tmpDir+to_string((uint) iChunk);//local directory for this thread (iChunk) + + mkdir(bamDir.c_str(),P.runDirPerm); + binStart=new char* [nBins]; + binBytes=new uint64 [nBins]; + binStream=new ofstream* [nBins]; + binTotalN=new uint [nBins]; + binTotalBytes=new uint [nBins]; + for (uint ii=0;ii bamArraySize) {//write out this buffer 
+ throw std::runtime_error("Unimplemented!"); + //bgzf_write(bgzfBAM,bamArray,binBytes1); + + binBytes1=0;//rewind the buffer + }; + + memcpy(bamArray+binBytes1, bamIn, bamSize); + binBytes1 += bamSize; + +}; + +void BAMoutput::unsortedFlush () {//flush all alignments + throw std::runtime_error("Unimplemented!"); + //bgzf_write(bgzfBAM,bamArray,binBytes1); + binBytes1=0;//rewind the buffer +}; + +void BAMoutput::coordOneAlign (char *bamIn, uint bamSize, uint iRead) { + + uint32 *bamIn32; + uint alignG; + uint32 iBin=0; + + if (bamSize==0) { + return; //no output, could happen if one of the mates is not mapped + } else { + //determine which bin this alignment belongs to + bamIn32=(uint32*) bamIn; + alignG=( ((uint) bamIn32[1]) << 32 ) | ( (uint)bamIn32[2] ); + if (bamIn32[1] == ((uint32) -1) ) {//unmapped + iBin=P.outBAMcoordNbins-1; + } else if (nBins>1) {//bin starts have already been determined + iBin=binarySearch1a (alignG, P.outBAMsortingBinStart, (int32) (nBins-1)); + }; + }; + +// if ( alignG == (uint32) -1 ) {//unmapped alignment, last bin +// iBin=nBins-1; +// } else { +// iBin=(alignG + chrStart)/binGlen; +// }; + + //write buffer is filled + if (binBytes[iBin]+bamSize+sizeof(uint) > ( (iBin>0 || nBins>1) ? 
binSize : binSize1) ) {//write out this buffer + if ( nBins>1 || iBin==(P.outBAMcoordNbins-1) ) {//normal writing, bins have already been determined + binStream[iBin]->write(binStart[iBin],binBytes[iBin]); + binBytes[iBin]=0;//rewind the buffer + } else {//the first chunk of reads was written in one bin, need to determine bin sizes, and re-distribute reads into bins + coordBins(); + coordOneAlign (bamIn, bamSize, iRead);//record the current align into the new bins + return; + }; + }; + + //record this alignment in its bin + memcpy(binStart[iBin]+binBytes[iBin], bamIn, bamSize); + binBytes[iBin] += bamSize; + memcpy(binStart[iBin]+binBytes[iBin], &iRead, sizeof(uint)); + binBytes[iBin] += sizeof(uint); + binTotalBytes[iBin] += bamSize+sizeof(uint); + binTotalN[iBin] += 1; + return; +}; + +void BAMoutput::coordBins() {//define genomic starts for bins + nBins=P.outBAMcoordNbins;//this is the true number of bins + + //mutex here + if (P.outBAMsortingBinStart[0]!=0) {//it's set to 0 only after the bin sizes are determined + //extract coordinates and sort + uint *startPos = new uint [binTotalN[0]+1];//array of aligns start positions + for (uint ib=0,ia=0;ialogMain << "BAM sorting: "<logMain << "BAM sorting bins genomic start loci:\n"; + + P.outBAMsortingBinStart[0]=0; + for (uint32 ib=1; ib<(nBins-1); ib++) { + P.outBAMsortingBinStart[ib]=startPos[binTotalN[0]/(nBins-1)*ib]; + P.inOut->logMain << ib <<"\t"<< (P.outBAMsortingBinStart[ib]>>32) << "\t" << ((P.outBAMsortingBinStart[ib]<<32)>>32) <write(binStart[iBin],binBytes[iBin]); + binStream[iBin]->flush(); + binBytes[iBin]=0;//rewind the buffer + }; +}; + +void BAMoutput::coordUnmappedPrepareBySJout () {//flush all alignments + uint iBin=P.outBAMcoordNbins-1; + binStream[iBin]->write(binStart[iBin],binBytes[iBin]); + binStream[iBin]->flush(); + binBytes[iBin]=0;//rewind the buffer + binStream[iBin]->close(); + binStream[iBin]->open((bamDir +"/"+to_string(iBin)+".BySJout").c_str()); +}; diff --git 
a/star-sys/STAR/source/BAMoutput.h b/star-sys/STAR/source/BAMoutput.h new file mode 100644 index 0000000..bc939ea --- /dev/null +++ b/star-sys/STAR/source/BAMoutput.h @@ -0,0 +1,37 @@ +#ifndef CODE_BAMoutput +#define CODE_BAMoutput + +#include "IncludeDefine.h" +#include SAMTOOLS_BGZF_H +#include "Parameters.h" + +class BAMoutput {// +public: + //sorted output + BAMoutput (int iChunk, string tmpDir, Parameters &Pin); + void coordOneAlign (char *bamIn, uint bamSize, uint iRead); + void coordBins (); + void coordFlush (); + //unsorted output + BAMoutput (BGZF *bgzfBAMin, Parameters &Pin); + void unsortedOneAlign (char *bamIn, uint bamSize, uint bamSize2); + void unsortedFlush (); + void coordUnmappedPrepareBySJout(); + + uint32 nBins; //number of bins to split genome into + uint* binTotalN; //total number of aligns in each bin + uint* binTotalBytes;//total size of aligns in each bin +private: + uint64 bamArraySize; //this size will be allocated + char* bamArray; //large array to store the bam alignments, pre-sorted + uint64 binSize, binSize1;//storage size of each bin + uint64 binGlen;//bin genomic length + char **binStart; //pointers to starts of the bins + uint64 *binBytes, binBytes1;//number of bytes currently written to each bin + ofstream **binStream;//output streams for each bin + BGZF *bgzfBAM; + Parameters &P; + string bamDir; +}; + +#endif diff --git a/star-sys/STAR/source/Chain.cpp b/star-sys/STAR/source/Chain.cpp new file mode 100644 index 0000000..2cff52a --- /dev/null +++ b/star-sys/STAR/source/Chain.cpp @@ -0,0 +1,126 @@ +#include "Chain.h" +#include "streamFuns.h" +#include "serviceFuns.cpp" + +Chain::Chain(Parameters &Pin, string chainFileNameIn) : P(Pin), chainFileName(chainFileNameIn) +{ + chainLoad(); +}; + +void Chain::chainLoad() +{ + ifstream &streamIn = ifstrOpen(chainFileName, ERROR_OUT, "SOLUTION: check path and permission for the chain file" + chainFileName, P); + + string chr1;//current chromsome 1 (old) + + while (streamIn.good()) + { + 
string line1; + getline(streamIn,line1); + istringstream line1str(line1); + + vector fields(13); + + for (int ii=0;ii<4;ii++) + line1str >> fields[ii]; + if (fields[0]=="") + {//empty line, continue + } else if (fields[1]=="") + {//end of chain + chrChains[chr1].bLen.push_back(std::stoi(fields[0]));//read the last block length + chrChains[chr1].bN=chrChains[chr1].bLen.size(); + } else if (fields[3]=="") + {//normal chain block + chrChains[chr1].bLen.push_back(std::stoi(fields[0])); + + uint s=chrChains[chr1].bStart1.back() + chrChains[chr1].bLen.back() + std::stoi(fields[1]);//prev start + length + shift + chrChains[chr1].bStart1.push_back(s); + + s=chrChains[chr1].bStart2.back() + chrChains[chr1].bLen.back() + std::stoi(fields[2]);//prev start + length + shift + chrChains[chr1].bStart2.push_back(s); + } else + {//chain header + //chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id + // 0 1 2 3 4 5 6 7 8 9 10 11 12 + + for (int ii=4;ii<13;ii++) + line1str >> fields[ii]; //read all the fields + + chr1=fields[2]; + chrChains[chr1].chr1=chr1; + chrChains[chr1].chr2=fields[7];//NOTE: the whole procedure (for now) only works for single chain per chromosome + chrChains[chr1].bStart1.push_back(std::stoi(fields[5])); + chrChains[chr1].bStart2.push_back(std::stoi(fields[10])); + }; + }; +}; + +void Chain::liftOverGTF(string gtfFileName, string outFileName) +{//simple arithmetic lift-over of the GTF file + ifstream &streamIn = ifstrOpen(gtfFileName, ERROR_OUT, "SOLUTION: check path and permission for the GTF file" + gtfFileName, P); + ofstream &streamOut = ofstrOpen(outFileName, ERROR_OUT, P); + ofstream &streamOutUnlifted = ofstrOpen(outFileName+".unlifted", ERROR_OUT, P); + + while (streamIn.good()) + { + string line1; + getline(streamIn,line1); + istringstream line1str(line1); + + string chr1; + line1str >> chr1; + + if (chr1=="" || chr1.substr(0,1)=="#") + continue;//empty or comment line + + if (chrChains.count(chr1)==0) + exitWithError("GTF 
contains chromosome " + chr1 + " not present in the chain file " + chainFileName,std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + + OneChain *ch1 = & chrChains[chr1];//the chain for the chr1 + + string str1,str2; + line1str >> str1 >> str2;//fields 2,3 + + uint c1, c2[2]; //coordinates: 1/2 (old/new) + + for (int ii=0;ii<2;ii++) + {//read and transform the two coordinates + line1str >> c1; + int32 i1 = binarySearch1a (c1, ch1->bStart1.data(), ch1->bN); + + c2[ii]=-1;//-1 means impossible to lift this end + + if (i1>=0 && c1 < ch1->bStart1[i1]+ch1->bLen[i1]) + {//c1 is inside the block, simple transformation + c2[ii]=ch1->bStart2[i1] + c1 - ch1->bStart1[i1]; + } else + {//c1 is outside of the block + if (ii==0 && i1 < (int32) ch1->bN-1) + {//left end => c2 will be at the start of the next block + c2[ii]=ch1->bStart2[i1+1]; //if i1=-1, it will work = start of the 0-tn blocl + } else if (ii==1 && i1 >= 0) + { + c2[ii]=ch1->bStart2[i1]+ch1->bLen[i1]-1; + }; + }; + }; + if (c2[0]!=-1llu && c2[1]!=-1llu && c2[1]>=c2[0]) + {//good conversion + streamOut << ch1->chr2 <<"\t"<< str1 <<"\t"<< str2 <<"\t"<logMain, EXIT_CODE_INPUT_FILES, P); +// }; +// uint ichr=mapGen.chrNameIndex[oldname];//chr index in the genome list +// bStart1[bN] += mapGen.chrLength[ichr];//whole genome chain - shift by chr start diff --git a/star-sys/STAR/source/Chain.h b/star-sys/STAR/source/Chain.h new file mode 100644 index 0000000..9b0339a --- /dev/null +++ b/star-sys/STAR/source/Chain.h @@ -0,0 +1,30 @@ +#ifndef DEF_Chain +#define DEF_Chain + +#include "IncludeDefine.h" +#include "Parameters.h" +#include "ErrorWarning.h" + +class OneChain +{ + public: + uint bN; + string chr1,chr2;//1/2 (old/new) chr names + vector bStart1, bStart2, bLen; //blocks starts in 1/2, lengths +}; + +class Chain { + public: +// // uint bN;//number of blocks +// // vector bStart1, bStart2, bLen; //blocks starts in 1/2, lengths + + Chain(Parameters &Pin, string chainFileNameIn); + void liftOverGTF(string 
gtfFileName, string outFileName); + private: + Parameters &P; + string chainFileName; + void chainLoad(); + std::map chrChains; +}; + +#endif \ No newline at end of file diff --git a/star-sys/STAR/source/ChimericAlign.cpp b/star-sys/STAR/source/ChimericAlign.cpp new file mode 100644 index 0000000..032b683 --- /dev/null +++ b/star-sys/STAR/source/ChimericAlign.cpp @@ -0,0 +1,32 @@ +#include "ChimericAlign.h" + +ChimericAlign::ChimericAlign(ChimericSegment &seg1in, ChimericSegment &seg2in, int chimScoreIn, const Genome &genomeIn, ReadAlign *RAin) + : seg1(seg1in), seg2(seg2in),chimScore(chimScoreIn), P(seg1in.P), pCh(P.pCh), mapGen(genomeIn), RA(RAin) { + stitchingDone=false; + + al1=&seg1.align; + al2=&seg2.align; + + if (al1->roStart > al2->roStart) + swap (al1,al2); + + ex1 = al1->Str==1 ? 0 : al1->nExons-1; + ex2 = al2->Str==0 ? 0 : al2->nExons-1; +}; + +bool ChimericAlign::chimericCheck() { + bool chimGood=true; + + chimGood = chimGood && al1->exons[ex1][EX_iFrag] <= al2->exons[ex2][EX_iFrag];//otherwise - strange configuration, both segments contain two mates + //if ( trChim[0].exons[e0][EX_iFrag] > trChim[1].exons[e1][EX_iFrag] ) {//strange configuration, rare, similar to the next one + // chimN=0;//reject such chimeras + //good test example: + //CTTAGCTAGCAGCGTCTTCCCAGTGCCTGGAGGGCCAGTGAGAATGGCACCCTCTGGGATTTTTGCTCCTAGGTCT + //TTGAGGTGAAGTTCAAAGATGTGGCTGGCTGTGAGGAGGCCGAGCTAGAGATCATGGAATTTGTGAATTTCTTGAA + //} else + + //junction overhangs too short for chimerically spliced mates + chimGood = chimGood && (al1->exons[ex1][EX_iFrag] < al2->exons[ex2][EX_iFrag] || (al1->exons[ex1][EX_L] >= pCh.junctionOverhangMin && al2->exons[ex2][EX_L] >= pCh.junctionOverhangMin) ); + + return chimGood; +}; diff --git a/star-sys/STAR/source/ChimericAlign.h b/star-sys/STAR/source/ChimericAlign.h new file mode 100644 index 0000000..0258493 --- /dev/null +++ b/star-sys/STAR/source/ChimericAlign.h @@ -0,0 +1,40 @@ +#ifndef CODE_ChimericAlign +#define CODE_ChimericAlign + +#include 
"IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "ChimericSegment.h" +#include "Genome.h" + +#include + +class ReadAlign; + +class ChimericAlign +{// + public: + ChimericSegment seg1, seg2; //two chimeric segments +// std::unique_ptr al1, al2; //two chimeric alignments - modified by chimeric switching + Transcript *al1, *al2; + uint ex1, ex2; + + uint chimJ1, chimJ2, chimRepeat1, chimRepeat2; + int chimMotif, chimStr, chimScore; + + ChimericAlign(ChimericSegment &seg1in, ChimericSegment &seg2in, int chimScoreIn, const Genome &genomeIn, ReadAlign *RAin); //allocate + void chimericJunctionOutput(fstream &outStream, uint chimN, int maxNonChimAlignScore, bool PEmerged_flag, int chimScoreBest, int maxPossibleAlignScore); + void chimericStitching(char *genSeq, char **Read1); + bool chimericCheck(); + + bool stitchingDone; + + private: + const Parameters &P; + const ParametersChimeric &pCh; + const Genome &mapGen; + ReadAlign *RA; + +}; + +#endif diff --git a/star-sys/STAR/source/ChimericAlign_chimericJunctionOutput.cpp b/star-sys/STAR/source/ChimericAlign_chimericJunctionOutput.cpp new file mode 100644 index 0000000..330f563 --- /dev/null +++ b/star-sys/STAR/source/ChimericAlign_chimericJunctionOutput.cpp @@ -0,0 +1,23 @@ +#include "ChimericAlign.h" +#include "ReadAlign.h" + +void ChimericAlign::chimericJunctionOutput(fstream &outStream, uint chimN, int maxNonChimAlignScore, bool PEmerged_flag, int chimScoreBest, int maxPossibleAlignScore) +{ + outStream << mapGen.chrName[al1->Chr] <<"\t"<< chimJ1 - mapGen.chrStart[al1->Chr]+1 <<"\t"<< (al1->Str==0 ? "+":"-") \ + <<"\t"<< mapGen.chrName[al2->Chr] <<"\t"<< chimJ2 - mapGen.chrStart[al2->Chr]+1 <<"\t"<< (al2->Str==0 ? 
"+":"-") \ + <<"\t"<< chimMotif <<"\t"<< chimRepeat1 <<"\t"<< chimRepeat2 <<"\t"<< al1->readName+1 \ + <<"\t"<< al1->exons[0][EX_G] - mapGen.chrStart[al1->Chr]+1 <<"\t"<< al1->generateCigarP() \ + <<"\t"<< al2->exons[0][EX_G] - mapGen.chrStart[al2->Chr]+1 <<"\t"<< al2->generateCigarP() + <<"\t"<< chimN // number of multimapping chimeric alignments for this read. + << "\t" << maxPossibleAlignScore // the maximum possible alignment score (currently the sum of the (paired) read lengths) + << "\t" << maxNonChimAlignScore // trBest - the best alignment score from a non-chimeric alignment of this read to the ref genome. + << "\t" << chimScore // current chimeric alignment score + << "\t" << chimScoreBest // best chimeric score among multimapping chimeric alignments. + << "\t" << PEmerged_flag; // boolean indicating paired reads were merged into a single read before alignment & chimer detection. + + if (P.outSAMattrPresent.RG) + outStream <<"\t"<< P.outSAMattrRG.at(RA->readFilesIndex); + if (P.pSolo.type>0) + outStream <<"\t"<< RA->soloRead->readBar->cbSeq <<"\t"<< RA->soloRead->readBar->umiSeq; + outStream <<"\n"; //<<"\t"<< trChim[0].exons[0][EX_iFrag]+1 --- no need for that, since trChim[0] is always on the first mate +}; diff --git a/star-sys/STAR/source/ChimericAlign_chimericStitching.cpp b/star-sys/STAR/source/ChimericAlign_chimericStitching.cpp new file mode 100644 index 0000000..90ad22e --- /dev/null +++ b/star-sys/STAR/source/ChimericAlign_chimericStitching.cpp @@ -0,0 +1,181 @@ +#include "ChimericAlign.h" + +void ChimericAlign::chimericStitching(char *genSeq, char **Read1) { + + if (stitchingDone) + return; + + stitchingDone=true; + + char *readSeq=Read1[0]; //only direct read sequence is used - reverse complemented if necessary in the algorithm + + al1=new Transcript(*al1); + al2=new Transcript(*al2); + + Transcript &a1=*al1; + Transcript &a2=*al2;//to use instead of pointers + + chimStr = max(seg1.str,seg2.str); //segment strands are either equal, or one is 
zero - select the non-zero strand + + chimRepeat1=0; chimRepeat2=0; chimJ1=0; chimJ2=0; chimMotif=0; + + if ( a1.exons[ex1][EX_iFrag] < a2.exons[ex2][EX_iFrag] ) {//mates bracket the chimeric junction + chimMotif=-1; + if (a1.Str==1) {//negative strand + chimJ1=a1.exons[ex1][EX_G]-1; + } else { + chimJ1=a1.exons[ex1][EX_G]+a1.exons[ex1][EX_L]; + }; + if (a2.Str==0) {//positive strand + chimJ2=a2.exons[ex2][EX_G]-1; + } else { + chimJ2=a2.exons[ex2][EX_G]+a2.exons[ex2][EX_L]; + }; + } else {//chimeric junctions is within one of the mates, check and shift chimeric junction if necessary + uint roStart0 = a1.Str==0 ? a1.exons[ex1][EX_R] : a1.Lread - a1.exons[ex1][EX_R] - a1.exons[ex1][EX_L]; + uint roStart1 = a2.Str==0 ? a2.exons[ex2][EX_R] : a1.Lread - a2.exons[ex2][EX_R] - a2.exons[ex2][EX_L]; + + uint jR, jRbest=0; + int jScore=0,jMotif=0,jScoreBest=-999999,jScoreJ=0; + uint jRmax = roStart1+a2.exons[ex2][EX_L]; + jRmax = jRmax>roStart0 ? jRmax-roStart0-1 : 0; + for (jR=0; jR3 || b1>3) ) || bR>3) {//chimera is not called if there are Ns in the genome or in the read + chimScore=0; + return; + }; + + char b01,b02,b11,b12; + if (a1.Str==0) { + b01=genSeq[a1.exons[ex1][EX_G]+jR+1]; + b02=genSeq[a1.exons[ex1][EX_G]+jR+2]; + } else { + b01=genSeq[a1.exons[ex1][EX_G]+a1.exons[ex1][EX_L]-1-jR-1]; + if (b01<4) b01=3-b01; + b02=genSeq[a1.exons[ex1][EX_G]+a1.exons[ex1][EX_L]-1-jR-2]; + if (b02<4) b02=3-b02; + }; + if (a2.Str==0) { + b11=genSeq[a2.exons[ex2][EX_G]-roStart1+roStart0+jR-1]; + b12=genSeq[a2.exons[ex2][EX_G]-roStart1+roStart0+jR]; + } else { + b11=genSeq[a2.exons[ex2][EX_G]+a2.exons[ex2][EX_L]-1+roStart1-roStart0-jR+1]; + if (b11<4) b11=3-b11; + b12=genSeq[a2.exons[ex2][EX_G]+a2.exons[ex2][EX_L]-1+roStart1-roStart0-jR]; + if (b12<4) b12=3-b12; + }; + + jMotif=0; + if (b01==2 && b02==3 && b11==0 && b12==2) {//GTAG + if (chimStr!=2) { + jMotif=1; + }; + } else if(b01==1 && b02==3 && b11==0 && b12==1) {//CTAC + if (chimStr!=1) { + jMotif=2; + }; + }; + + if (bR==b0 && 
bR!=b1) { + jScore++; + } else if (bR!=b0 && bR==b1) { + jScore--; + }; + + jScoreJ =jMotif==0 ? jScore + P.pCh.scoreJunctionNonGTAG : jScore ; + + if ( jScoreJ > jScoreBest || (jScoreJ == jScoreBest && jMotif>0) ) { + chimMotif=jMotif; + jRbest=jR; + jScoreBest=jScoreJ; + }; + };//jR cycle + + + //shift junction in trChim + if (a1.Str==1) { + a1.exons[ex1][EX_R] +=a1.exons[ex1][EX_L]-jRbest-1; + a1.exons[ex1][EX_G] +=a1.exons[ex1][EX_L]-jRbest-1; + a1.exons[ex1][EX_L]=jRbest+1; + chimJ1=a1.exons[ex1][EX_G]-1; + } else { + a1.exons[ex1][EX_L]=jRbest+1; + chimJ1=a1.exons[ex1][EX_G]+a1.exons[ex1][EX_L]; + }; + + if (a2.Str==0) { + a2.exons[ex2][EX_R] +=roStart0+jRbest+1-roStart1; + a2.exons[ex2][EX_G] +=roStart0+jRbest+1-roStart1; + a2.exons[ex2][EX_L]=roStart1+a2.exons[ex2][EX_L]-roStart0-jRbest-1; + chimJ2=a2.exons[ex2][EX_G]-1; + } else { + a2.exons[ex2][EX_L]=roStart1+a2.exons[ex2][EX_L]-roStart0-jRbest-1; + chimJ2=a2.exons[ex2][EX_G]+a2.exons[ex2][EX_L]; + }; + //find repeats + char b0,b1; + for (jR=0;jR<100;jR++) {//forward check + if (a1.Str==0) { + b0=genSeq[chimJ1+jR]; + } else { + b0=genSeq[chimJ1-jR]; + if (b0<4) b0=3-b0; + }; + + if (a2.Str==0) { + b1=genSeq[chimJ2+1+jR]; + } else { + b1=genSeq[chimJ2-1-jR]; + if (b1<4) b1=3-b1; + }; + if (b0!=b1) break; + }; + chimRepeat2=jR; + for (jR=0;jR<100;jR++) {//reverse check + if (a1.Str==0) { + b0=genSeq[chimJ1-1-jR]; + } else { + b0=genSeq[chimJ1+1+jR]; + if (b0<4) b0=3-b0; + }; + + if (a2.Str==0) { + b1=genSeq[chimJ2-jR]; + } else { + b1=genSeq[chimJ2+jR]; + if (b1<4) b1=3-b1; + }; + if (b0!=b1) break; + }; + chimRepeat1=jR; + };//chimeric junction is within a mate + + if (chimMotif>=0 && (a1.exons[ex1][EX_L] chimAligns; + bool chimRecord; + int chimScoreBest; + + ChimericDetection(const Parameters &Pin, Transcript ***trAll, uint *nWinTr, char** Read1in, const Genome &genomeIn, fstream *ostreamChimJunctionIn, ReadAlign *RA); + bool chimericDetectionMult(uint nWin, uint *readLengthIn, int maxNonChimAlignScore, 
bool PEmerged_flag); + bool chimericDetectionMult(uint nWin, uint *readLengthIn); + fstream *ostreamChimJunction; +}; + +#endif diff --git a/star-sys/STAR/source/ChimericDetection_chimericDetectionMult.cpp b/star-sys/STAR/source/ChimericDetection_chimericDetectionMult.cpp new file mode 100644 index 0000000..1967d00 --- /dev/null +++ b/star-sys/STAR/source/ChimericDetection_chimericDetectionMult.cpp @@ -0,0 +1,133 @@ +//#include "blocksOverlap.h" +#include "ChimericDetection.h" +#include "ChimericSegment.h" + +int chimericAlignScore (ChimericSegment & seg1, ChimericSegment & seg2) +{ + int chimScore=0; + uint chimOverlap = seg2.roS>seg1.roS ? (seg2.roS>seg1.roE ? 0 : seg1.roE-seg2.roS+1) : (seg2.roE= seg1.align.readLength[0]) || (seg2.roE < seg1.align.readLength[0] && seg1.roS >= seg1.align.readLength[0]); + + //segment lengths && (different mates || small gap between segments) + if (seg1.roE > seg1.P.pCh.segmentMin + seg1.roS + chimOverlap && seg2.roE > seg1.P.pCh.segmentMin + seg2.roS + chimOverlap \ + && ( diffMates || ( (seg1.roE + seg1.P.pCh.segmentReadGapMax + 1) >= seg2.roS && (seg2.roE + seg1.P.pCh.segmentReadGapMax + 1) >= seg1.roS ) ) ) + { + chimScore = seg1.align.maxScore + seg2.align.maxScore - (int)chimOverlap; //subtract overlap to avoid double counting + }; + + return chimScore; +}; + +///////////////////////////////////////////////////////////// +bool ChimericDetection::chimericDetectionMult(uint nW, uint *readLength, int maxNonChimAlignScore, bool PEmerged_flag) { + + chimRecord=false; + +// for (uint ii=0;iistitchingDone) {//al1,al2 were allocated + delete cAit->al1; + delete cAit->al2; + }; + }; + + chimAligns.clear(); + chimScoreBest=0; + + int maxPossibleAlignScore = (int)(readLength[0]+readLength[1]); + + for (uint iW1=0; iW1 maxNonChimAlignScore + && + chimScore >= maxPossibleAlignScore - P.pCh.scoreDropMax + && + chimScore >= P.pCh.scoreMin + && + chimScore>=chimScoreBest-(int)P.pCh.multimapScoreRange + ) + {//candidate chimera + 
ChimericAlign chAl(seg1, seg2, chimScore, outGen, RA); + + if (!chAl.chimericCheck()) + continue; //check chimeric alignment + + //re-calculated chimScoreBest includes non-canonical penalty, so the re-calculated score is lower, in some cases it goes to 0 if some checks are not passed + chAl.chimericStitching(outGen.G, Read1); + // rescore after stitching. + if (chAl.chimScore > maxNonChimAlignScore) { // survived stitching. + chimAligns.push_back(chAl);//add this chimeric alignment + + if (chimAligns.back().chimScore > chimScoreBest) + chimScoreBest=chimAligns.back().chimScore; + }; // endif stitched chimera survived. + + }; // endif meets chim score criteria + };//cycle over window2 aligns + };//cycle over window2 + };//cycle over window1 aligns + };//cycle over window1 + + if (chimScoreBest==0) + return chimRecord; + + chimN=0; + for (auto cAit=chimAligns.begin(); cAitchimScore >= chimScoreBest - (int)P.pCh.multimapScoreRange) + ++chimN; + }; + + /* + if (chimN > 2*P.pCh.multimapNmax) //too many loci (considering 2* more candidates for stitching below) + return chimRecord; + + chimN=0; + for (auto cAit=chimAligns.begin(); cAitchimScore >= chimScoreBest-(int)P.pCh.multimapScoreRange) { + cAit->chimericStitching(outGen.G, Read1[0]); + if (cAit->chimScore >= chimScoreBest - (int)P.pCh.multimapScoreRange) + ++chimN; + }; + }; + */ + + if (chimN > P.pCh.multimapNmax) //too many loci + return chimRecord; + + for (auto cAit=chimAligns.begin(); cAitchimScore >= chimScoreBest-(int)P.pCh.multimapScoreRange) + cAit->chimericJunctionOutput(*ostreamChimJunction, chimN, maxNonChimAlignScore, PEmerged_flag, chimScoreBest, maxPossibleAlignScore); + }; + + if (chimN>0) + chimRecord=true; + + return chimRecord; +};//END diff --git a/star-sys/STAR/source/ChimericSegment.cpp b/star-sys/STAR/source/ChimericSegment.cpp new file mode 100644 index 0000000..4612d11 --- /dev/null +++ b/star-sys/STAR/source/ChimericSegment.cpp @@ -0,0 +1,32 @@ +#include "ChimericSegment.h" + 
+ChimericSegment::ChimericSegment(const Parameters &Pin, Transcript &alignIn) : P(Pin), pCh(Pin.pCh), align(alignIn) +{ + if ( (align.intronMotifs[1]==0 && align.intronMotifs[2]==0) || (align.intronMotifs[1]>0 && align.intronMotifs[2]>0)) {//strand is undefined + str=0; + } else if ( (align.Str==0) == (align.intronMotifs[1]>0)) {//strand the same as RNA. + //This assumes that the aligns have consistent strands, i.e. only intronMotifs[1]>0 OR intronMotifs[2]>0 + str=1; + } else {//strand opposite to RNA + str=2; + }; + roS=align.Str==0 ? align.exons[0][EX_R] : align.Lread - align.exons[align.nExons-1][EX_R] - align.exons[align.nExons-1][EX_L]; + roE=align.Str==0 ? align.exons[align.nExons-1][EX_R] + align.exons[align.nExons-1][EX_L] - 1 : align.Lread - align.exons[0][EX_R] - 1; + if (roS>align.readLength[0]) roS--; + if (roE>align.readLength[0]) roE--; +}; + +bool ChimericSegment::segmentCheck() +{ + bool segGood = true; + segGood = segGood && align.rLength >= pCh.segmentMin; //mapped length >= chim segmentMin + segGood = segGood && align.intronMotifs[0]==0; //no non-canonical unannotated juncions. + return segGood; + + //this is already tested for each align with default --outFilterIntronStrands RemoveInconsistentStrands + //segGood = segGood && (align.intronMotifs[1]==0 || align.intronMotifs[2]==0); //consistent intron motifs. 
+ //this is not requiered since seg2 is tested for length + // segGood = segGood && (align.exons[align.nExons-1][EX_R] + align.exons[align.nExons-1][EX_L] + P.pCh.segmentMin <= Lread + // || align.exons[0][EX_R] >= P.pCh.segmentMin); //uncovered by seg1 read length is <= segmentMin + +}; diff --git a/star-sys/STAR/source/ChimericSegment.h b/star-sys/STAR/source/ChimericSegment.h new file mode 100644 index 0000000..2e2f664 --- /dev/null +++ b/star-sys/STAR/source/ChimericSegment.h @@ -0,0 +1,23 @@ +#ifndef CODE_ChimericSegment +#define CODE_ChimericSegment + +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "ParametersChimeric.h" + +class ChimericSegment +{// + public: + const Parameters &P; + const ParametersChimeric &pCh; + + Transcript &align; //alignment + uint roS,roE,str; //start/end/strand in original read coordinates + + ChimericSegment(const Parameters &Pin, Transcript &alignIn); //allocate + bool segmentCheck();//check if chimeric segment is good + private: +}; + +#endif diff --git a/star-sys/STAR/source/ChimericTranscript.h b/star-sys/STAR/source/ChimericTranscript.h new file mode 100644 index 0000000..e569e52 --- /dev/null +++ b/star-sys/STAR/source/ChimericTranscript.h @@ -0,0 +1,19 @@ +#ifndef CODE_ChimericTranscript +#define CODE_ChimericTranscript + +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" + +class ChimericTranscript +{// + public: + Transcript **chTrs; //all chimeric transcripts + uint nCh; //number of recorded (best) chimeric transcripts + uint nChSize; //size of the chTrs array, will be increased if nCh > nChSize + + ChimericTranscript(Parameters &Pin); //allocate + private: +}; + +#endif \ No newline at end of file diff --git a/star-sys/STAR/source/ErrorWarning.cpp b/star-sys/STAR/source/ErrorWarning.cpp new file mode 100644 index 0000000..b3627ac --- /dev/null +++ b/star-sys/STAR/source/ErrorWarning.cpp @@ -0,0 +1,29 @@ +/* + functions that handle errors and warnings +*/ 
+#include "ErrorWarning.h" +#include "TimeFunctions.h" + +void exitWithError(string messageOut, ostream &streamOut1, ostream &streamOut2, int errorInt, const Parameters &/*P*/) { + time_t timeCurrent; + time( &timeCurrent); + if (streamOut1.good()) { + streamOut1 << "\n" << messageOut << endl << timeMonthDayTime(timeCurrent) <<" ...... FATAL ERROR, exiting\n" <1) pthread_mutex_unlock(&g_threadChunks.mutexError); + exit(errorInt); +}; + +void warningMessage(string messageOut, ostream &streamOut1, ostream &streamOut2, const Parameters &/*P*/) { + time_t timeCurrent; + time( &timeCurrent); + if (streamOut1.good()) { + streamOut1 << "WARNING: " << messageOut << " : " << timeMonthDayTime(timeCurrent) < +#include +#include +#include + +//addresses with respect to shmStart of several genome values +#define SHM_sizeG 0 +#define SHM_sizeSA 8 +#define SHM_startG 16 +// #define SHM_startSA 24 +// +// //first available byt of the shm +// #define SHM_startSHM 32 + + +//arbitrary number for ftok function +#define SHM_projectID 23 + +Genome::Genome (Parameters &Pin ): pGe(Pin.pGe), P(Pin), shmStart(NULL) { + shmKey=ftok(pGe.gDir.c_str(),SHM_projectID); + + sjdbOverhang = pGe.sjdbOverhang; //will be re-defined later if another value was used for the generated genome + sjdbLength = pGe.sjdbOverhang==0 ? 
0 : pGe.sjdbOverhang*2+1; +}; + +Genome::~Genome() +{ + freeMemory(); +} + +void Genome::freeMemory(){//free big chunks of memory used by genome and suffix array + + if (pGe.gLoad=="NoSharedMemory") {//can deallocate only for non-shared memory + if (G1 != NULL) delete[] G1; + G1=NULL; + SA.deallocateArray(); + SApass2.deallocateArray(); + SAi.deallocateArray(); + }; +}; + +uint Genome::OpenStream(string name, ifstream & stream, uint size) +{ + stream.open((pGe.gDir+ "/" +name).c_str(), ios::binary); + if (!stream.good()) { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: could not open genome file: "<< pGe.gDir << "/" << name <<"\n"; + errOut << "SOLUTION: check that the path to genome files, specified in --genomeDir is correct and the files are present, and have user read permissions\n" <logMain, EXIT_CODE_GENOME_FILES, P); + }; + + + if (size>0) { + P.inOut->logMain << name << ": size given as a parameter = " << size <<"\n"; + } else { + P.inOut->logMain << "Checking " << name << " size"; + stream.seekg (0, ios::end); + int64 size1 = stream.tellg(); + if (size1<=0) { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: failed reading from genome file: "<< pGe.gDir << "/" << name <<"\n"; + errOut << "SOLUTION: re-generate the genome index\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, 1, P); + }; + size=(uint) size1; + stream.clear(); + stream.seekg (0, ios::beg); + P.inOut->logMain << "file size: "<< size <<" bytes; state: good=" <logStdOut) << timeMonthDayTime(rawtime) << " ..... 
loading genome\n" <logMain << "Reading genome generation parameters:\n"; + + //read genome internal parameters + while (parFile.good()) { + string word1; + parFile >> word1; + if (word1=="###") { + parFile >> word1; + if (word1=="GstrandBit") { + uint gsb1=0; + parFile >> gsb1; + GstrandBit=(uint8) gsb1; + P.inOut->logMain << "### GstrandBit=" << (uint) GstrandBit <<"\n"; + } else { + P.inOut->logMain << "### " <logMain <logMain, EXIT_CODE_GENOME_FILES, P); + }; + + //check genome version + if (P1.versionGenome.size()==0) {// + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: read no value for the versionGenome parameter from genomeParameters.txt file\n"; + errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P); + } else if (P1.versionGenome == P.versionGenome || P1.versionGenome == "20201") {// + P.inOut->logMain << "Genome version is compatible with current STAR\n"; + } else { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: Genome version: " << P1.versionGenome << " is INCOMPATIBLE with running STAR version: "<< STAR_VERSION <<"\n"; + errOut << "SOLUTION: please re-generate genome from scratch with running version of STAR, or with version: " << P.versionGenome <<"\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P); + }; + + //find chr starts from files + chrInfoLoad(); + + //check if sjdbInfo.txt exists => genome was generated with junctions + bool sjdbInfoExists=false; + struct stat sjdb1; + if ( stat( (pGe.gDir+"/sjdbInfo.txt").c_str(), &sjdb1) == 0 ) + {//file exists + sjdbInfoExists=true; + }; + + if ( P.sjdbInsert.yes && sjdbInfoExists && P1.pGe.sjdbInsertSave=="") + {//if sjdbInsert, and genome had junctions, and genome is old - it should be re-generated with new STAR + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: old Genome is 
INCOMPATIBLE with on the fly junction insertion\n"; + errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P); + }; + + //record required genome parameters in P + pGe.gSAindexNbases=P1.pGe.gSAindexNbases; + pGe.gChrBinNbits=P1.pGe.gChrBinNbits; + genomeChrBinNbases=1LLU<0) + {//genomeFileSize was recorded in the genomeParameters file, copy the values to P + pGe.gFileSizes = P1.pGe.gFileSizes; + }; + + if (P.parArray.at(pGe.sjdbOverhang_par)->inputLevel==0 && P1.pGe.sjdbOverhang>0) + {//if --sjdbOverhang was not defined by user and it was defined >0 at the genome generation step, then use pGe.sjdbOverhang from the genome generation step + pGe.sjdbOverhang=P1.pGe.sjdbOverhang; + P.inOut->logMain << "--sjdbOverhang = " << pGe.sjdbOverhang << " taken from the generated genome\n"; + } else if (sjdbInfoExists && P.parArray.at(pGe.sjdbOverhang_par)->inputLevel>0 && pGe.sjdbOverhang!=P1.pGe.sjdbOverhang) + {//if pGe.sjdbOverhang was defined at the genome generation step,the mapping step value has to agree with it + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: present --sjdbOverhang="<logMain, EXIT_CODE_GENOME_FILES, P); + }; + + sjdbOverhang = pGe.sjdbOverhang; + sjdbLength = pGe.sjdbOverhang==0 ? 
0 : pGe.sjdbOverhang*2+1; + + P.inOut->logMain << "Started loading the genome: " << asctime (localtime ( &rawtime ))<<"\n"<logMain << "Read from SAindex: pGe.gSAindexNbases=" << pGe.gSAindexNbases <<" nSAi="<< nSAi <logMain << "nGenome=" << nGenome << "; nSAbyte=" << nSAbyte <logMain <<"GstrandBit="<logMain, EXIT_CODE_MEMORY_ALLOCATION, P); + + } else if (pGe.gLoad=="NoSharedMemory") {// simply allocate memory, do not use shared memory + genomeInsertL=0; + genomeInsertChrIndFirst=nChrReal; + if (pGe.gFastaFiles.at(0)!="-") + {//will insert sequences in the genome, now estimate the extra size + uint oldlen=chrStart.back();//record the old length + genomeInsertL=genomeScanFastaFiles(P, G, false, *this)-oldlen; + }; + + try { + + if (P.sjdbInsert.pass1 || P.sjdbInsert.pass2) + {//reserve extra memory for insertion at the 1st and/or 2nd step + nGenomeInsert=nGenome+genomeInsertL; + nSAinsert=nSA+2*genomeInsertL; + + nGenomePass1=nGenomeInsert; + nSApass1=nSAinsert; + if (P.sjdbInsert.pass1) + { + nGenomePass1+=P.limitSjdbInsertNsj*sjdbLength; + nSApass1+=2*P.limitSjdbInsertNsj*sjdbLength; + }; + + nGenomePass2=nGenomePass1; + nSApass2=nSApass1; + if (P.sjdbInsert.pass2) + { + nGenomePass2+=P.limitSjdbInsertNsj*sjdbLength; + nSApass2+=2*P.limitSjdbInsertNsj*sjdbLength; + }; + + G1=new char[nGenomePass2+L+L]; + + SApass2.defineBits(GstrandBit+1,nSApass2); + SApass2.allocateArray(); + + SApass1.defineBits(GstrandBit+1,nSApass1); + SApass1.pointArray(SApass2.charArray+SApass2.lengthByte-SApass1.lengthByte); + + SAinsert.defineBits(GstrandBit+1,nSAinsert); + SAinsert.pointArray(SApass1.charArray+SApass1.lengthByte-SAinsert.lengthByte); + + SA.pointArray(SAinsert.charArray+SAinsert.lengthByte-SA.lengthByte); + } else + {//no sjdb insertions + if (genomeInsertL==0) + {// no sequence insertion, simple allocation + G1=new char[nGenome+L+L]; + SA.allocateArray(); + } else + { + G1=new char[nGenome+L+L+genomeInsertL]; + 
SAinsert.defineBits(GstrandBit+1,nSA+2*genomeInsertL);//TODO: re-define GstrandBit if necessary + SAinsert.allocateArray(); + SA.pointArray(SAinsert.charArray+SAinsert.lengthByte-SA.lengthByte); + }; + }; + SAi.allocateArray(); + P.inOut->logMain <<"Shared memory is not used for genomes. Allocated a private copy of the genome.\n"<logMain, EXIT_CODE_MEMORY_ALLOCATION, P); + }; + + } + + +// if (twopass1readsN==0) {//not 2-pass +// shmStartG=SHM_startSHM; +// shmStartSA=0; +// } else {//2-pass +// ostringstream errOut; +// errOut << "EXITING because of FATAL ERROR: 2-pass procedure cannot be used with genome already loaded im memory' "\n" ; +// errOut << "SOLUTION: check shared memory settings as explained in STAR manual, OR run STAR with --genomeLoad NoSharedMemory to avoid using shared memory\n" <logMain, EXIT_CODE_SHM, P); +// }; + + + G=G1+L; + + if (pGe.gLoad=="NoSharedMemory") {//load genome and SAs from files + //load genome + P.inOut->logMain <<"Genome file size: "<logMain <<"Loading Genome ... " << flush; + uint genomeReadBytesN=fstreamReadBig(GenomeIn,G,nGenome); + P.inOut->logMain <<"done! state: good=" <logMain <<"SA file size: "<logMain <<"Loading SA ... " << flush; + genomeReadBytesN=fstreamReadBig(SAin,SA.charArray, SA.lengthByte); + P.inOut->logMain <<"done! state: good=" <logMain <<"Loading SAindex ... 
" << flush; + SAiInBytes +=fstreamReadBig(SAiIn,SAi.charArray, SAi.lengthByte); + P.inOut->logMain <<"done: "<logMain << "Finished loading the genome: " << asctime (localtime ( &rawtime )) <<"\n"<logMain << "Sum of all Genome bytes: " <logMain << "Sum of all SA bytes: " <logMain << "Sum of all SAi bytes: " <logMain << "pGe.gLoad=LoadAndExit: completed, the genome is loaded and kept in RAM, EXITING now.\n"<logMain, EXIT_CODE_INPUT_FILES, P); + }; + + + sjdbInfo >> sjdbN >> pGe.sjdbOverhang; + P.inOut->logMain << "Processing splice junctions database sjdbN=" <> sjdbStart[ii] >> sjdbEnd[ii] >> d1 >> d2 >> d3 >> d4; + sjdbMotif[ii] = (uint8) d1; + sjdbShiftLeft[ii] = (uint8) d2; + sjdbShiftRight[ii] = (uint8) d3; + sjdbStrand[ii] = (uint8) d4; + }; + sjDstart[ii] = sjdbStart[ii] - pGe.sjdbOverhang; + sjAstart[ii] = sjdbEnd[ii] + 1; + if (sjdbMotif[ii]==0) {//shinon-canonical junctions back to their true coordinates + sjDstart[ii] += sjdbShiftLeft[ii]; + sjAstart[ii] += sjdbShiftLeft[ii]; + }; + }; + }; + + //check and redefine some parameters + //max intron size + if (P.alignIntronMax==0 && P.alignMatesGapMax==0) { + P.inOut->logMain << "alignIntronMax=alignMatesGapMax=0, the max intron size will be approximately determined by (2^winBinNbits)*winAnchorDistNbins=" \ + << (1LLU<0 but alignMatesGapMax==0, winBinNbits will be defined by alignIntronMax + P.inOut->logMain << "To accommodate alignIntronMax="< " << "pGe.gChrBinNbits=" << pGe.gChrBinNbits << " redefining:\n"; + P.winBinNbits=pGe.gChrBinNbits; + P.inOut->logMain << "winBinNbits=" <logMain << "To accommodate alignIntronMax="<logMain, EXIT_CODE_INPUT_FILES, P); + }; + + char chrInChar[1000]; + + while (chrStreamIn.good()) { + string chrIn; + chrStreamIn.getline(chrInChar,1000); + chrIn=chrInChar; + if (chrIn=="") break; + chrName.push_back(chrIn); + }; + chrStreamIn.close(); + nChrReal=chrName.size(); + + P.inOut->logMain << "Number of real (reference) chromosomes= " << nChrReal <<"\n"<logMain, 
EXIT_CODE_INPUT_FILES, P); + }; + + for (uint ii=0;ii> chrLength[ii]; + }; + chrStreamIn.close(); + + //load chr starts + chrStreamIn.open( (pGe.gDir+"/chrStart.txt").c_str() ); + if (chrStreamIn.fail()) { + ostringstream errOut; + errOut << "EXITING because of FATAL error, could not open file " << (pGe.gDir+"/chrStart.txt") <<"\n"; + errOut << "SOLUTION: re-generate genome files with STAR --runMode genomeGenerate\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + + for (uint ii=0;ii<=nChrReal;ii++) { + chrStreamIn >> chrStart[ii]; + }; + chrStreamIn.close(); + + //log + for (uint ii=0; iilogMain << ii+1 <<"\t"<< chrName[ii] <<"\t"<=chrStart[ichr]) ichr++; + chrBin[ii]=ichr-1; + }; +}; diff --git a/star-sys/STAR/source/Genome.h b/star-sys/STAR/source/Genome.h new file mode 100644 index 0000000..d5deaf2 --- /dev/null +++ b/star-sys/STAR/source/Genome.h @@ -0,0 +1,66 @@ +#ifndef GENOME_DEF +#define GENOME_DEF + +#include "IncludeDefine.h" +#include "Parameters.h" +#include "PackedArray.h" +#include "Variation.h" + +class Genome { + public: + char *G, *sigG; + PackedArray SA,SAinsert,SApass1,SApass2; + PackedArray SAi; + Variation *Var; + + uint nGenomeInsert, nGenomePass1, nGenomePass2, nSAinsert, nSApass1, nSApass2; + + ParametersGenome &pGe; + + //chr parameters + vector chrStart, chrLength, chrLengthAll; + uint genomeChrBinNbases, chrBinN, *chrBin; + vector chrName, chrNameAll; + map chrNameIndex; + + uint *genomeSAindexStart;//starts of the L-mer indices in the SAindex, 1<=L<=pGe.gSAindexNbases + + uint nGenome, nSA, nSAbyte, nChrReal;//genome length, SA length, # of chromosomes, vector of chromosome start loci + uint nGenome2, nSA2, nSAbyte2, nChrReal2; //same for the 2nd pass + uint nSAi; //size of the SAindex + unsigned char GstrandBit, SAiMarkNbit, SAiMarkAbsentBit; //SA index bit for strand information + uint GstrandMask, SAiMarkAbsentMask, SAiMarkAbsentMaskC, SAiMarkNmask, SAiMarkNmaskC;//maske to remove strand 
bit from SA index, to remove mark from SAi index + + //SJ database parameters + uint sjdbOverhang, sjdbLength; //length of the donor/acceptor, length of the sj "chromosome" =2*pGe.sjdbOverhang+1 including spacer + uint sjChrStart,sjdbN; //first sj-db chr + uint sjGstart; //start of the sj-db genome sequence + uint *sjDstart,*sjAstart,*sjStr, *sjdbStart, *sjdbEnd; //sjdb loci + uint8 *sjdbMotif; //motifs of annotated junctions + uint8 *sjdbShiftLeft, *sjdbShiftRight; //shifts of junctions + uint8 *sjdbStrand; //junctions strand, not used yet + + //sequence insert parameters + uint genomeInsertL; //total length of the sequence to be inserted on the fly + uint genomeInsertChrIndFirst; //index of the first inserted chromosome + + Genome (Parameters &Pin ); + ~Genome(); + + void freeMemory(); + void genomeLoad(); + void chrBinFill(); + void chrInfoLoad(); + + void insertSequences(); + + void genomeGenerate(); + + private: + Parameters &P; + key_t shmKey; + char *shmStart; + char *G1; //pointer -200 of G + uint OpenStream(string name, ifstream & stream, uint size); +}; +#endif diff --git a/star-sys/STAR/source/Genome_genomeGenerate.cpp b/star-sys/STAR/source/Genome_genomeGenerate.cpp new file mode 100644 index 0000000..1af3a8b --- /dev/null +++ b/star-sys/STAR/source/Genome_genomeGenerate.cpp @@ -0,0 +1,507 @@ +#include + +#include "IncludeDefine.h" +#include "Parameters.h" +#include "SuffixArrayFuns.h" +#include "PackedArray.h" +#include "TimeFunctions.h" +#include "ErrorWarning.h" +#include "loadGTF.h" +#include "SjdbClass.h" +#include "sjdbLoadFromFiles.h" +#include "sjdbPrepare.h" +#include "genomeParametersWrite.h" +#include "sjdbInsertJunctions.h" +#include "genomeScanFastaFiles.h" +#include "genomeSAindex.h" + +#include "serviceFuns.cpp" +#include "streamFuns.h" +#include "SequenceFuns.h" + + +char* globalG; +uint globalL; + + +inline int funCompareSuffixes ( const void *a, const void *b){ + + uint *ga=(uint*)((globalG-7LLU)+(*((uint*)a))); + uint 
*gb=(uint*)((globalG-7LLU)+(*((uint*)b))); + + uint jj=0; + int ii=0; + uint va=0,vb=0; + uint8 *va1, *vb1; + + while (jj < globalL) { + va=*(ga-jj); + vb=*(gb-jj); + + #define has5(v) ((((v)^0x0505050505050505) - 0x0101010101010101) & ~((v)^0x0505050505050505) & 0x8080808080808080) + + if (has5(va) && has5(vb)) + {//there is 5 in the sequence - only compare bytes before 5 + va1=(uint8*) &va; + vb1=(uint8*) &vb; + for (ii=7;ii>=0;ii--) + { + if (va1[ii]>vb1[ii]) + { + return 1; + } else if (va1[ii] *((uint*)b) ) + {//anti-stable order,since indexes are sorted in the reverse order + return -1; + } else + {//a cannot be equal to b + return 1; + }; + }; + }; + } else + {//no 5, simple comparison + if (va>vb) + { + return 1; + } else if (va *((uint*)b) ) + {//anti-stable order,since indexes are sorted in the reverse order + return -1; + } else + {//a cannot be equal to b + return 1; + }; +}; + +// inline bool funCompareSuffixesBool ( const void *a, const void *b) +// { +// uint jj=0LLU; +// +// uint *ga=(uint*)((globalG-7LLU)+(*((uint*)a))); +// uint *gb=(uint*)((globalG-7LLU)+(*((uint*)b))); +// uint va=0,vb=0; +// +// while (va==vb && jj>GstrandBit) == 0; + SAstr &= GstrandMask; + if ( !strandG ) SAstr += N; + return SAstr; +}; + +void Genome::genomeGenerate() { + + //check parameters + if (sjdbOverhang<=0 && (pGe.sjdbFileChrStartEnd.at(0)!="-" || pGe.sjdbGTFfile!="-")) + { + ostringstream errOut; + errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: for generating genome with annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n"; + errOut << "you need to specify >0 --sjdbOverhang\n"; + errOut << "SOLUTION: re-run genome generation specifying non-zero --sjdbOverhang, which ideally should be equal to OneMateLength-1, or could be chosen generically as ~100\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + } + if (pGe.sjdbFileChrStartEnd.at(0)=="-" && pGe.sjdbGTFfile=="-") + { + if 
(P.parArray.at(P.pGe.sjdbOverhang_par)->inputLevel>0 && sjdbOverhang>0) + { + ostringstream errOut; + errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: when generating genome without annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n"; + errOut << "do not specify >0 --sjdbOverhang\n"; + errOut << "SOLUTION: re-run genome generation without --sjdbOverhang option\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + sjdbOverhang=0; + }; + + //time + time_t rawTime; + string timeString; + + time(&rawTime); + P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... starting to generate Genome files\n" <logStdOut << timeMonthDayTime(rawTime) <<" ... starting to generate Genome files\n" < chrStartMap; + for (uint ii=0;ii (chrName[ii], chrStart[ii])); + }; + + uint nInserted=0, nWrongChr=0, nWrongRef=0, nRefN=0; + while (consIn.good()) { + string chr1, refIn, altIn, dummy; + uint start1; + char ref1,alt1; + + consIn >> chr1 >> start1 >> dummy >> refIn >> altIn; + consIn.ignore(numeric_limits::max(),'\n'); + + convertNucleotidesToNumbers(refIn.c_str(),&ref1,1); + convertNucleotidesToNumbers(altIn.c_str(),&alt1,1); + --start1;//VCF positions are 1-based + + if (chrStartMap.count(chr1)==1) {//otherwise just skip + start1+=chrStartMap[chr1]; + if (G[start1]>3) + ++nRefN; + + if (G[start1]==ref1 || G[start1]>3) { + G[start1]=alt1; + ++nInserted; + } else { + ++nWrongRef; + P.inOut->logMain << "WARNING: reference allele in consensus file does not agree with reference genome base: "; + P.inOut->logMain << chr1 <<" "<< start1-chrStartMap[chr1] <<" "<< (int) G[start1]<<" "<< (int) ref1<<" "<< (int) alt1<<"\n"; + }; + } else { + ++nWrongChr; + }; + }; + P.inOut->logMain <<"Inserted consensus variants: " << nInserted <<", including reference N-base:"<< nRefN <<", wrong chromosome: " << nWrongChr<< ", wrong reference base: " << nWrongRef << endl; + }; + + uint N = nGenomeReal; + nGenome=N; + uint N2 = N*2; + + if 
(pGe.gSAindexNbases > log2(nGenome)/2-1) { + ostringstream warnOut; + warnOut << "--genomeSAindexNbases " << pGe.gSAindexNbases << " is too large for the genome size=" << nGenome; + warnOut << ", which may cause seg-fault at the mapping step. Re-run genome generation with recommended --genomeSAindexNbases " << int(log2(nGenome)/2-1); + warningMessage(warnOut.str(),P.inOut->logMain,std::cerr,P); + }; + + ofstream & chrN = ofstrOpen(pGe.gDir+"/chrName.txt",ERROR_OUT, P); + ofstream & chrS = ofstrOpen(pGe.gDir+"/chrStart.txt",ERROR_OUT, P); + ofstream & chrL = ofstrOpen(pGe.gDir+"/chrLength.txt",ERROR_OUT, P); + ofstream & chrNL = ofstrOpen(pGe.gDir+"/chrNameLength.txt",ERROR_OUT, P); + + for (uint ii=0;iilogMain, EXIT_CODE_INPUT_FILES, P); + }; + + //preparing to generate SA + for (uint ii=0;iilogMain <<"Estimated genome size="<logMain << "GstrandBit=" << int(GstrandBit) <<"\n"; + + GstrandMask = ~(1LLU<logMain << "Number of SA indices: "<< nSA << "\n"<logMain << timeMonthDayTime(rawTime) <<" ... starting to sort Suffix Array. This may take a long time...\n" <logStdOut << timeMonthDayTime(rawTime) <<" ... starting to sort Suffix Array. This may take a long time...\n" <1) saChunkSize=min(saChunkSize,nSA/(P.runThreadN-1)); + + uint saChunkN=nSA/saChunkSize;//estimate + uint* indPrefStart = new uint [saChunkN*2]; //start and stop, *2 just in case + uint* indPrefChunkCount = new uint [saChunkN*2]; + indPrefStart[0]=0; + saChunkN=0;//start counting chunks + uint chunkSize1=indPrefCount[0]; + for (uint ii=1; ii saChunkSize) { + saChunkN++; + indPrefStart[saChunkN]=ii; + indPrefChunkCount[saChunkN-1]=chunkSize1-indPrefCount[ii]; + chunkSize1=indPrefCount[ii]; + }; + }; + saChunkN++; + indPrefStart[saChunkN]=indPrefN+1; + indPrefChunkCount[saChunkN-1]=chunkSize1; + + P.inOut->logMain << "Number of chunks: " << saChunkN <<"; chunks size limit: " << saChunkSize*8 <<" bytes\n" <logMain << timeMonthDayTime(rawTime) <<" ... 
sorting Suffix Array chunks and saving them to disk...\n" <logStdOut << timeMonthDayTime(rawTime) <<" ... sorting Suffix Array chunks and saving them to disk...\n" <=indPrefStart[iChunk] && p1logMain << timeMonthDayTime(rawTime) <<" ... loading chunks from disk, packing SA...\n" <logStdOut << timeMonthDayTime(rawTime) <<" ... loading chunks from disk, packing SA...\n" <logMain, EXIT_CODE_INPUT_FILES, P); + }; + + //DONE with suffix array generation + + for (uint ii=0;iilogMain << timeMonthDayTime(rawTime) <<" ... finished generating suffix array\n" <logStdOut << timeMonthDayTime(rawTime) <<" ... finished generating suffix array\n" <logMain << timeMonthDayTime(rawTime) <<" ... writing Genome to disk ...\n" <logStdOut << timeMonthDayTime(rawTime) <<" ... writing Genome to disk ...\n" <logMain << "SA size in bytes: "<logMain << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <logStdOut << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <logMain << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <logStdOut << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <logMain << timeMonthDayTime(rawTime) << " ..... finished successfully\n" <logStdOut << timeMonthDayTime(rawTime) << " ..... finished successfully\n" <logMain << timeMonthDayTime(rawtime) << " ..... inserting extra sequences into genome indexes" <flush(); + if (outSAM!=NULL) outSAM->flush(); + + logStdOutFile.flush(); + outSAMfile.flush(); + + outChimSAM.flush(); + outChimJunction.flush(); + logProgress.flush(); + logMain.flush(); + logFinal.flush(); + outLocalChains.flush(); + + //logStdOutFile.close(); //do not want to close these log files, as some destructors (e.g. 
~SharedMemory) might still write there + //logMain.close(); + + outSAMfile.close(); + outChimSAM.close(); + outChimJunction.close(); + logProgress.close(); + logFinal.close(); + outLocalChains.close(); + + + for (int ii=0;ii<2;ii++) { + if (outUnmappedReadsStream[ii].is_open()) { + outUnmappedReadsStream[ii].flush(); + outUnmappedReadsStream[ii].close(); + } + }; +}; + diff --git a/star-sys/STAR/source/InOutStreams.h b/star-sys/STAR/source/InOutStreams.h new file mode 100644 index 0000000..5d3e845 --- /dev/null +++ b/star-sys/STAR/source/InOutStreams.h @@ -0,0 +1,49 @@ +#ifndef INOUTSTREAMS_DEF +#define INOUTSTREAMS_DEF + +#include +#include +#include "IncludeDefine.h" +#include SAMTOOLS_BGZF_H + +template > +class NullBuf: public std::basic_streambuf { + inline typename traits::int_type overflow(typename traits::int_type c) { + return traits::not_eof(c); + } +}; + +template > +class NullStream: public std::basic_ostream { + public: + inline NullStream(): + std::basic_ios(&nullbuf), + std::basic_ostream(&nullbuf) + { std::basic_ios::init(&nullbuf); } + inline void open(const char*, std::ios_base::openmode = std::ios_base::out) {} + inline void close() {} + + private: + NullBuf nullbuf; +}; + +class InOutStreams { + public: + ostream *logStdOut, *outSAM; + ofstream outSAMfile; + BGZF *outBAMfileUnsorted, *outBAMfileCoord, *outQuantBAMfile; + + ofstream outChimSAM, outChimJunction, logFinal, outUnmappedReadsStream[MAX_N_MATES]; + ifstream readIn[MAX_N_MATES]; + + //send logs to nothing + NullStream<> logStdOutFile, logMain, logProgress; + + //compilation-optional streams + ofstream outLocalChains; + + InOutStreams(); + ~InOutStreams(); +}; + +#endif diff --git a/star-sys/STAR/source/IncludeDefine.h b/star-sys/STAR/source/IncludeDefine.h new file mode 100644 index 0000000..25a7f0f --- /dev/null +++ b/star-sys/STAR/source/IncludeDefine.h @@ -0,0 +1,240 @@ +#ifndef INCLUDEDEFINE_DEF +#define INCLUDEDEFINE_DEF + +//standard libs +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "VERSION" + +#define ERROR_OUT string ( __FILE__ ) +":"+ to_string ( (uint) __LINE__ ) +":"+ string ( __FUNCTION__ ) + +//external libs +#define SAMTOOLS_BGZF_H "htslib/htslib/bgzf.h" +#define SAMTOOLS_SAM_H "htslib/htslib/sam.h" + +using namespace std; + +#ifdef COMPILE_FOR_MAC + //some Mac's idiosyncrasies: standard SHM libraries are very old and missing some definitions + #define SHM_NORESERVE 0 +#endif + +#if defined(__mips__) && !defined(SHM_NORESERVE) +#define SHM_NORESERVE 010000 +#endif + +typedef int8_t int8; +typedef uint8_t uint8; + +#define uint unsigned long long +#define sint signed long long +#define uint64 unsigned long long +#define uint32 unsigned int +#define uint16 unsigned short int +#define uchar unsigned char +#define int64 long long +#define int32 int + +// this is gcc extension, may need to redefine for other compilers +#define uint128 __uint128_t + +#define GENOME_spacingChar 5 + +#define uintWinBin unsigned short +#define uintWinBinMax numeric_limits::max() + + +#define intSWscore int +#define intScore int + +#define scoreMatch 1 + + +//cleaned +//output +#define BAMoutput_oneAlignMaxBytes 100000 + + +//SAM attributes +#define ATTR_NH 1 +#define ATTR_HI 2 +#define ATTR_AS 3 +#define ATTR_NM 4 +#define ATTR_MD 5 +#define ATTR_nM 6 +#define ATTR_jM 7 +#define ATTR_jI 8 +#define ATTR_XS 9 +#define ATTR_RG 10 +#define ATTR_vG 11 +#define ATTR_vA 12 +#define ATTR_vW 13 +#define ATTR_ch 14 +#define ATTR_MC 15 +#define ATTR_rB 16 +#define ATTR_CR 17 +#define ATTR_CY 18 +#define ATTR_UR 19 +#define ATTR_UY 20 + +//BAM definitions +#define BAM_CIGAR_MaxSize 10000 +#define BAM_CIGAR_OperationShift 4 +#define BAM_CIGAR_M 0 +#define BAM_CIGAR_I 1 +#define BAM_CIGAR_D 2 +#define BAM_CIGAR_N 3 +#define BAM_CIGAR_S 4 +#define BAM_CIGAR_H 5 +#define BAM_CIGAR_P 6 +#define BAM_CIGAR_EQ 7 +#define 
BAM_CIGAR_X 8 + + +#if defined COMPILE_FOR_LONG_READS + #define MAX_N_EXONS 1000 + #define BAM_ATTR_MaxSize 10000 +#else + #define MAX_N_EXONS 20 + #define BAM_ATTR_MaxSize 1000 +#endif + +//input reads +#define MAX_N_MATES 2 +#define DEF_readNameLengthMax 50000 +#if defined COMPILE_FOR_LONG_READS + #define DEF_readSeqLengthMax 500000 +#else + #define DEF_readSeqLengthMax 650 +#endif + +#if (DEF_readNameLengthMax > DEF_readSeqLengthMax) + #define DEF_readNameSeqLengthMax DEF_readNameLengthMax +#else + #define DEF_readNameSeqLengthMax DEF_readSeqLengthMax +#endif + +#define EXIT_CODE_BUG 101 +#define EXIT_CODE_PARAMETER 102 +#define EXIT_CODE_RUNTIME 103 +#define EXIT_CODE_INPUT_FILES 104 +#define EXIT_CODE_GENOME_FILES 105 +#define EXIT_CODE_SHM 106 +#define EXIT_CODE_GENOME_LOADING_WAITED_TOO_LONG 107 +#define EXIT_CODE_MEMORY_ALLOCATION 108 +#define EXIT_CODE_FILE_OPEN 109 +#define EXIT_CODE_FILE_WRITE 110 +#define EXIT_CODE_INCONSISTENT_DATA 111 + +//cleaned-end + + +//exit codes +#define EXIT_createExtendWindowsWithAlign_TOO_MANY_WINDOWS 101 + +#define SJ_MOTIF_SIZE 7 //number of recorded SJ motifs +#define SJ_SAM_AnnotatedMotifShift 20 + +#define EXTEND_ORDER 1 //1-first extend to the 5' of the read, then 3'; 2- first extend to the left, then to the right + +#define MAX_N_FRAG 2 +#define MARK_FRAG_SPACER_BASE 11 +#define MAX_N_CHIMERAS 5 +#define MAX_N_MULTMAP 100000 //max number of multiple mappers +#define MAX_SJ_REPEAT_SEARCH 255 //max length of a repeat to search around a SJ +#define MAX_QS_VALUE 60 +#define MAX_OUTPUT_FLAG 10 + +#define PC_rStart 0 +#define PC_Length 1 +#define PC_Str 2 +#define PC_Dir 3 +#define PC_Nrep 4 +#define PC_SAstart 5 +#define PC_SAend 6 +#define PC_iFrag 7 +#define PC_SIZE 8 + +#define WC_Str 0 +#define WC_Chr 1 +#define WC_gStart 2 +#define WC_gEnd 3 +#define WC_SIZE 4 + +#define WA_Length 0 +#define WA_rStart 1 +#define WA_gStart 2 +#define WA_Nrep 3 +#define WA_Anchor 4 +#define WA_iFrag 5 +#define WA_sjA 6 +#define WA_SIZE 
7 + +#define EX_R 0 +#define EX_G 1 +#define EX_L 2 +#define EX_iFrag 3 +#define EX_sjA 4 +#define EX_SIZE 5 + +//mapType +#define MT_PE 0 //paired end type +#define MT_SIZE 5 + +#define MARKER_ALL_PIECES_EXCEED_seedMultimapNmax 999901 //marks the reads that map too many time, more than seedMultimapNmax +#define MARKER_NO_UNIQUE_PIECES 999902 //the best transcript does not contain any unique pieces +#define MARKER_NO_GOOD_WINDOW 999903 //did not find any good windows +#define MARKER_NO_GOOD_PIECES 999904 +#define MARKER_TOO_MANY_ANCHORS_PER_WINDOW 999905 +#define MARKER_MAX_N_MULT_EXCEEDED 999906 +#define MARKER_FULL_LENGTH_MULTIMAPPER_EXCEEDED_alignWindowsPerReadNmax 999907 +#define MARKER_ALL_PIECES_EXCEEDED_winAnchorMultimapNmax 999908 +#define MARKER_TOO_MANY_CHIMERAS 999909 +#define MARKER_READ_TOO_SHORT 999910 + +#define PEMARKER_SINGLE_END 0 +#define PEMARKER_PAIR 1 +#define PEMARKER_ONE_END 3 +#define PEMARKER_TOO_MANY_PAIRS 5 +#define PEMARKER_CHIMERIC_PAIRS 7 +#define PEMARKER_CHIMERIC_SJ_READ1 221 +#define PEMARKER_CHIMERIC_SJ_READ2 223 +#define PEMARKER_CHIMERIC_SJ_READ1and2 225 +#define PEMARKER_SINGLE_END_NOTMAPPED 1001 + + +typedef uint uiPC[PC_SIZE]; +typedef uint uiWC[WC_SIZE]; +typedef uint uiWA[WA_SIZE]; + +// debugging +//#define DEBUG_Nread 1000000 +//#define DEBUG +#if defined DEBUG + #define DEBUG_stitch + #define DEBUG_Nread 200000 + #define DEBUG_NreadStart 1 + #define DEBUG_extend +#endif + +// #define DEBUG_NreadStart 500000 + +#endif diff --git a/star-sys/STAR/source/Makefile b/star-sys/STAR/source/Makefile new file mode 100644 index 0000000..7dc663a --- /dev/null +++ b/star-sys/STAR/source/Makefile @@ -0,0 +1,85 @@ +# user may define these whole flags +# CPPFLAGS +# CXXFLAGS +UNAME := $(shell uname) + +# or these user-set flags that will be added to standard flags +CXXFLAGSextra ?= + +# user may define the compiler +CXX ?= + +# pre-defined flags + +COMPTIMEPLACE := -D'COMPILATION_TIME_PLACE="source/Makefile"' + +ifeq ($(UNAME), Darwin) 
+ CXXFLAGS_extra := -D'COMPILE_FOR_MAC' +else + CXXFLAGS_extra := +endif +CXXFLAGS_common := -pipe -std=c++11 -Wall -Wextra -Werror -fPIC $(CXXFLAGS_extra) $(COMPTIMEPLACE) + +CXXFLAGS_main := -O3 -g $(CXXFLAGS_common) +CXXFLAGS_gdb := -O0 -g $(CXXFLAGS_common) + + +########################################################################################################## + +OBJECTS = orbit.o \ + InOutStreams.o \ + Parameters.o \ + ParametersSolo.o \ + ParametersChimeric_initialize.o \ + PackedArray.o \ + SequenceFuns.o \ + Genome.o \ + Genome_insertSequences.o \ + Genome_genomeGenerate.o \ + streamFuns.o \ + genomeScanFastaFiles.o \ + TimeFunctions.o \ + insertSeqSA.o \ + ReadAlign.o \ + Transcript.o \ + Transcriptome_quantAlign.o \ + funCompareUintAndSuffixesMemcmp.o \ + genomeSAindex.o \ + ReadAlign_outputAlignments.o \ + ReadAlign_outputTranscriptSAM.o \ + ReadAlign_quantTranscriptome.o \ + ReadAlign_calcCIGAR.o \ + ReadAlign_storeAligns.o \ + SuffixArrayFuns.o \ + ReadAlign_oneRead.o \ + ReadAlign_mapOneRead.o \ + ReadAlign_stitchPieces.o \ + ReadAlign_mappedFilter.o \ + ReadAlign_maxMappableLength2strands.o \ + ReadAlign_assignAlignToWindow.o \ + ReadAlign_createExtendWindowsWithAlign.o \ + ReadAlign_multMapSelect.o \ + readLoad.o \ + stitchWindowAligns.o \ + extendAlign.o \ + binarySearch2.o \ + blocksOverlap.o \ + stitchAlignToTranscript.o \ + ErrorWarning.o + +%.o : %.cpp + $(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< + +all: liborbit.a + +clean: + rm -f *.o *.a + +parametersDefault.xxd: parametersDefault + xxd -i parametersDefault > parametersDefault.xxd + +liborbit.a : CXXFLAGS := $(CXXFLAGSextra) $(CXXFLAGS_main) $(CXXFLAGS) +liborbit.a : $(OBJECTS) + ar -csru $@ $(OBJECTS) + + diff --git a/star-sys/STAR/source/OutSJ.cpp b/star-sys/STAR/source/OutSJ.cpp new file mode 100644 index 0000000..919a9a5 --- /dev/null +++ b/star-sys/STAR/source/OutSJ.cpp @@ -0,0 +1,117 @@ +#include "OutSJ.h" +#include "ErrorWarning.h" + +OutSJ::OutSJ (uint nSJmax, const Parameters &Pin, 
const Genome &genomeIn) : oneSJ(genomeIn), P(Pin), mapGen(genomeIn) {//do I need P? + + data = new char [oneSJ.dataSize*nSJmax]; //allocate big array of SJ loci and properties + memset(data,0,oneSJ.dataSize*nSJmax); + N=0;//initialize the counter +}; + +OutSJ::~OutSJ() { + delete[] data; +} + +int compareSJ(const void* i1, const void* i2) {//compare SJs from the data structure + uint s1=*( (uint*)i1 ); + uint s2=*( (uint*)i2 ); + + if (s1>s2) { + return 1; + } else if (s1g2) { + return 1; + } else if (g1> mapGen.pGe.gChrBinNbits]; + outStream << mapGen.chrName.at(sjChr) <<"\t"<< *start + 1 - mapGen.chrStart[sjChr] <<"\t"<<*start + *gap - mapGen.chrStart[sjChr] \ + <<"\t"<< int(*strand) <<"\t"<< int(*motif) <<"\t"<< int (*annot) <<"\t"<< *countUnique <<"\t"<< *countMultiple \ + <<"\t"<< *overhangLeft << endl; +}; + +void Junction::collapseOneSJ(char* isj1P, char* isjP, const Parameters& P) {//collapse isj junction into isj1: increase counts in isj1. choose max overhangs, motif, annot + *(uint32*)(isj1P+countUniqueP) += *(uint32*)(isjP+countUniqueP); + *(uint32*)(isj1P+countMultipleP) += *(uint32*)(isjP+countMultipleP); + + if (*(uint16*)(isj1P+overhangLeftP) < *(uint16*)(isjP+overhangLeftP) ) { + *(uint16*)(isj1P+overhangLeftP) = *(uint16*)(isjP+overhangLeftP) ; + }; + if (*(uint16*)(isj1P+overhangRightP) < *(uint16*)(isjP+overhangRightP) ) { + *(uint16*)(isj1P+overhangRightP) = *(uint16*)(isjP+overhangRightP) ; + }; + + if (*(isj1P+motifP) != *(isjP+motifP) ) { + uint s1=*(uint*)(isj1P+startP); + uint c1=mapGen.chrBin[ s1 >> mapGen.pGe.gChrBinNbits]; + + stringstream errOut; + errOut <<"EXITING becaues of BUG: different motifs for the same junction while collapsing junctions\n" \ + << mapGen.chrName[c1] <<" "<< s1-mapGen.chrStart[c1]+1 <<" "<logMain, EXIT_CODE_BUG, P);\ +// *(isj1P+motifP) = *(isjP+motifP) ; + }; + if (*(isj1P+annotP) < *(isjP+annotP) ) { + stringstream errOut; + errOut <<"EXITING becaues of BUG: different annotation status for the same junction 
while collapsing junctions:"\ + <<*(uint*)(isj1P+startP) <<" "<<*(uint32*)(isj1P+gapP) <<" "<logMain, EXIT_CODE_BUG, P);\ + +// *(isj1P+annotP) = *(isjP+annotP) ; + }; + +} diff --git a/star-sys/STAR/source/OutSJ.h b/star-sys/STAR/source/OutSJ.h new file mode 100644 index 0000000..6806be7 --- /dev/null +++ b/star-sys/STAR/source/OutSJ.h @@ -0,0 +1,57 @@ +#ifndef CODE_OutSJ +#define CODE_OutSJ + +#include "Parameters.h" +#include "Genome.h" + +class Junction {//one junction +public: + const static uint startP=0; + const static uint gapP=startP+sizeof(uint); + const static uint strandP=gapP+sizeof(uint32); + const static uint motifP=strandP+sizeof(char); + const static uint annotP=motifP+sizeof(char); + const static uint countUniqueP=annotP+sizeof(char); + const static uint countMultipleP=countUniqueP+sizeof(uint32); + const static uint overhangLeftP=countMultipleP+sizeof(uint32); + const static uint overhangRightP=overhangLeftP+sizeof(uint16); + + uint *start; + uint32 *gap; + char *strand, *motif, *annot; + uint32 *countUnique, *countMultiple; + uint16 *overhangLeft, *overhangRight; + + const static uint dataSize=overhangRightP+sizeof(uint16); + + Junction(const Genome &genomeIn); + void junctionPointer(char* sjPoint, uint isj); + void outputStream(ostream &outStream); + void collapseOneSJ(char* isj1P, char* isjP, const Parameters& P); + +private: + const Genome &mapGen; +}; + +class OutSJ { + +public: + //all junctions + char* data; //sj array[Njunctions][dataSize] + uint N; //number of junctions stored + Junction oneSJ; + + OutSJ(uint nSJmax, const Parameters &Pin, const Genome &genomeIn); + ~OutSJ(); + void collapseSJ();//collapse the junctions in data +// int compareSJ(void* i1, void* i2); + +private: + const Parameters &P; + const Genome &mapGen; +}; + +int compareSJ(const void* i1, const void* i2); //external functions + +#endif + diff --git a/star-sys/STAR/source/PackedArray.cpp b/star-sys/STAR/source/PackedArray.cpp new file mode 100644 index 
0000000..a31616d --- /dev/null +++ b/star-sys/STAR/source/PackedArray.cpp @@ -0,0 +1,43 @@ +# include "PackedArray.h" + +PackedArray::PackedArray() { + charArray=NULL; + arrayAllocated=false; +}; + +void PackedArray::defineBits(uint Nbits, uint lengthIn){ + wordLength=Nbits; + wordCompLength=sizeof(uint)*8LLU-wordLength; + bitRecMask=(~0LLU)>>wordCompLength; + length=lengthIn; + lengthByte=(length-1)*wordLength/8LLU+sizeof(uint); +// lengthByte=((lengthByte+sizeof(uint)-1LLU)/sizeof(uint))*sizeof(uint); +}; + +void PackedArray::writePacked( uint jj, uint x) { + uint b=jj*wordLength; + uint B=b/8LLU; + uint S=b%8LLU; + + x = x << S; + uint* a1 = (uint*) (charArray+B); + *a1 = ( (*a1) & ~(bitRecMask<>S)<>pa.wordCompLength; + return a1; + +}*/ + +inline uint PackedArray::operator [] (uint ii) const { + uint b=ii*wordLength; + uint B=b/8; + uint S=b%8; + + uint a1 = *((uint*) (charArray+B)); + a1 = ((a1>>S)<>wordCompLength; + return a1; +}; + +#endif diff --git a/star-sys/STAR/source/ParameterInfo.h b/star-sys/STAR/source/ParameterInfo.h new file mode 100644 index 0000000..89fd3c9 --- /dev/null +++ b/star-sys/STAR/source/ParameterInfo.h @@ -0,0 +1,120 @@ +#ifndef PARAMETERSINFO_DEF +#define PARAMETERSINFO_DEF + +class ParameterInfoBase { +public: + string nameString; //string that identifies parameter + int inputLevel; //where the parameter was defined + int inputLevelAllowed; //at which inpurt level parameter definition is allowed + virtual void inputValues(istringstream &streamIn) =0; + friend std::ostream& operator<< (std::ostream& o, ParameterInfoBase const& b); + virtual ~ParameterInfoBase() {}; +protected: + virtual void printValues(std::ostream& o) const = 0; +}; + + + +inline std::ostream& operator<< (std::ostream& o, ParameterInfoBase const& b) { + b.printValues(o); + return o; +}; + + +template +inline parameterType inputOneValue (istringstream &streamIn) { + parameterType oneV; + streamIn >> oneV; + return oneV; +}; +template <> +inline string inputOneValue 
(istringstream &streamIn) { + string oneV=""; + streamIn >> ws;//skip whitespace + if (streamIn.peek()!='"') {//simple parameter with no spaces or " + streamIn >> oneV; + } else { + streamIn.get();//skip " + getline(streamIn,oneV,'"'); + }; + return oneV; +}; +// Make sure -1 gets wrapped to UINT_MAX. It is nessessary to do it this way +// in case of older versions of libc. +template <> +inline uint inputOneValue (istringstream &streamIn) { + int64_t oneV; + streamIn >> oneV; + return static_cast(oneV); +}; + + +template +inline void printOneValue (parameterType *value, std::ostream& outStr) { + outStr << *value; +}; +template <> +inline void printOneValue (string *value, std::ostream& outStr) { + if ((*value).find_first_of(" \t")!=std::string::npos) {//there is white space in the argument, put "" around + outStr << '\"' << *value <<'\"'; + } else { + outStr << *value; + }; +}; + +template +class ParameterInfoScalar : public ParameterInfoBase { +public: + parameterType *value; + vector allowedValues; + + ParameterInfoScalar(int inputLevelIn, int inputLevelAllowedIn, string nameStringIn, parameterType* valueIn) { + nameString=nameStringIn; + inputLevel=inputLevelIn; + inputLevelAllowed=inputLevelAllowedIn; + value=valueIn; + }; + + void inputValues(istringstream &streamIn) { + *value=inputOneValue (streamIn); + }; + + ~ParameterInfoScalar() {}; +protected: + virtual void printValues(std::ostream& outStr) const { + printOneValue(value, outStr); + }; + +}; + +template +class ParameterInfoVector : public ParameterInfoBase { +public: + vector *value; + vector allowedValues; + + ParameterInfoVector(int inputLevelIn, int inputLevelAllowedIn, string nameStringIn, vector *valueIn) { + nameString=nameStringIn; + inputLevel=inputLevelIn; + inputLevelAllowed=inputLevelAllowedIn; + value=valueIn; + }; + + void inputValues(istringstream &streamIn) { + (*value).clear(); + while (streamIn.good()) { + (*value).push_back(inputOneValue (streamIn)); + streamIn >> ws; //remove white 
space, may arrive at the end of line + }; + }; + + ~ParameterInfoVector() {}; +protected: + virtual void printValues(std::ostream& outStr) const { + for (int ii=0; ii < (int) (*value).size(); ii++) { + printOneValue(&(*value).at(ii),outStr); + outStr<<" "; + }; + }; +}; +#endif diff --git a/star-sys/STAR/source/Parameters.cpp b/star-sys/STAR/source/Parameters.cpp new file mode 100755 index 0000000..75bea9a --- /dev/null +++ b/star-sys/STAR/source/Parameters.cpp @@ -0,0 +1,1468 @@ +#include +#include "IncludeDefine.h" +#include "Parameters.h" +#include "ErrorWarning.h" +#include "SequenceFuns.h" +#include "OutSJ.h" +#include "sysRemoveDir.h" +#include "stringSubstituteAll.h" +#include SAMTOOLS_BGZF_H +#include "GlobalVariables.h" + +//for mkfifo +#include + +#define PAR_NAME_PRINT_WIDTH 30 + +Parameters::Parameters() {//initalize parameters info + + inOut = new InOutStreams; + + //versions + parArray.push_back(new ParameterInfoScalar (-1, -1, "versionGenome", &versionGenome)); + + //parameters + parArray.push_back(new ParameterInfoVector (-1, 2, "parametersFiles", ¶metersFiles)); + + //system + parArray.push_back(new ParameterInfoScalar (-1, -1, "sysShell", &sysShell)); + + //run + parArray.push_back(new ParameterInfoScalar (-1, -1, "runMode", &runMode)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "runThreadN", &runThreadN)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "runDirPerm", &runDirPermIn)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "runRNGseed", &runRNGseed)); + + //genome + parArray.push_back(new ParameterInfoScalar (-1, -1, "genomeDir", &pGe.gDir)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "genomeLoad", &pGe.gLoad)); + parArray.push_back(new ParameterInfoVector (-1, -1, "genomeFastaFiles", &pGe.gFastaFiles)); + parArray.push_back(new ParameterInfoVector (-1, -1, "genomeChainFiles", &pGe.gChainFiles)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "genomeSAindexNbases", &pGe.gSAindexNbases)); + 
parArray.push_back(new ParameterInfoScalar (-1, -1, "genomeChrBinNbits", &pGe.gChrBinNbits)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "genomeSAsparseD", &pGe.gSAsparseD)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "genomeSuffixLengthMax", &pGe.gSuffixLengthMax)); + parArray.push_back(new ParameterInfoVector (-1, -1, "genomeFileSizes", &pGe.gFileSizes)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "genomeConsensusFile", &pGe.gConsensusFile)); + + //read + parArray.push_back(new ParameterInfoVector (-1, -1, "readFilesType", &readFilesType)); + parArray.push_back(new ParameterInfoVector (-1, -1, "readFilesIn", &readFilesIn)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "readFilesPrefix", &readFilesPrefix)); + parArray.push_back(new ParameterInfoVector (-1, -1, "readFilesCommand", &readFilesCommand)); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "readMatesLengthsIn", &readMatesLengthsIn)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "readMapNumber", &readMapNumber)); + parArray.push_back(new ParameterInfoVector (-1, -1, "readNameSeparator", &readNameSeparator)); + //parArray.push_back(new ParameterInfoScalar (-1, -1, "readStrand", &pReads.strandString)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "inputBAMfile", &inputBAMfile)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "bamRemoveDuplicatesType", &removeDuplicates.mode)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "bamRemoveDuplicatesMate2basesN", &removeDuplicates.mate2basesN)); + //limits + parArray.push_back(new ParameterInfoScalar (-1, -1, "limitGenomeGenerateRAM", &limitGenomeGenerateRAM)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "limitIObufferSize", &limitIObufferSize)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "limitOutSAMoneReadBytes", &limitOutSAMoneReadBytes)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "limitOutSJcollapsed", &limitOutSJcollapsed)); + 
parArray.push_back(new ParameterInfoScalar (-1, -1, "limitOutSJoneRead", &limitOutSJoneRead)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "limitBAMsortRAM", &limitBAMsortRAM)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "limitSjdbInsertNsj", &limitSjdbInsertNsj)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "limitNreadsSoft", &limitNreadsSoft)); + + //output + parArray.push_back(new ParameterInfoScalar (-1, 2, "outFileNamePrefix", &outFileNamePrefix)); + parArray.push_back(new ParameterInfoScalar (-1, 2, "outTmpDir", &outTmpDir)); + parArray.push_back(new ParameterInfoScalar (-1, 2, "outTmpKeep", &outTmpKeep)); + parArray.push_back(new ParameterInfoScalar (-1, 2, "outStd", &outStd)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outReadsUnmapped", &outReadsUnmapped)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outQSconversionAdd", &outQSconversionAdd)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outMultimapperOrder", &outMultimapperOrder.mode)); + + //outSAM + parArray.push_back(new ParameterInfoVector (-1, -1, "outSAMtype", &outSAMtype)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSAMmode", &outSAMmode)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSAMstrandField", &outSAMstrandField.in)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outSAMattributes", &outSAMattributes)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outSAMunmapped", &outSAMunmapped.mode)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSAMorder", &outSAMorder)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSAMprimaryFlag", &outSAMprimaryFlag)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSAMreadID", &outSAMreadID)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSAMmapqUnique", &outSAMmapqUnique)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSAMflagOR", &outSAMflagOR)); + parArray.push_back(new ParameterInfoScalar (-1, -1, 
"outSAMflagAND", &outSAMflagAND)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outSAMattrRGline", &outSAMattrRGline)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outSAMheaderHD", &outSAMheaderHD)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outSAMheaderPG", &outSAMheaderPG)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSAMheaderCommentFile", &outSAMheaderCommentFile)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outBAMcompression", &outBAMcompression)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outBAMsortingThreadN", &outBAMsortingThreadN)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outBAMsortingBinsN", &outBAMsortingBinsN)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outSAMfilter", &outSAMfilter.mode)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSAMmultNmax", &outSAMmultNmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSAMattrIHstart", &outSAMattrIHstart)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSAMtlen", &outSAMtlen)); + + + //output SJ filtering + parArray.push_back(new ParameterInfoScalar (-1, -1, "outSJfilterReads", &outSJfilterReads)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outSJfilterCountUniqueMin", &outSJfilterCountUniqueMin)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outSJfilterCountTotalMin", &outSJfilterCountTotalMin)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outSJfilterOverhangMin", &outSJfilterOverhangMin)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outSJfilterDistToOtherSJmin", &outSJfilterDistToOtherSJmin)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outSJfilterIntronMaxVsReadN", &outSJfilterIntronMaxVsReadN)); + + //output wiggle + parArray.push_back(new ParameterInfoVector (-1, -1, "outWigType", &outWigType)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outWigStrand", &outWigStrand)); + parArray.push_back(new 
ParameterInfoScalar (-1, -1, "outWigReferencesPrefix", &outWigReferencesPrefix)); + parArray.push_back(new ParameterInfoVector (-1, -1, "outWigNorm", &outWigNorm)); + + //output filtering + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterType", &outFilterType) ); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterMultimapNmax", &outFilterMultimapNmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterMultimapScoreRange", &outFilterMultimapScoreRange)); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterScoreMin", &outFilterScoreMin)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterScoreMinOverLread", &outFilterScoreMinOverLread)); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterMatchNmin", &outFilterMatchNmin)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterMatchNminOverLread", &outFilterMatchNminOverLread)); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterMismatchNmax", &outFilterMismatchNmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterMismatchNoverLmax", &outFilterMismatchNoverLmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterMismatchNoverReadLmax", &outFilterMismatchNoverReadLmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterIntronMotifs", &outFilterIntronMotifs)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "outFilterIntronStrands", &outFilterIntronStrands)); + + //clipping + parArray.push_back(new ParameterInfoVector (-1, -1, "clip5pNbases", &clip5pNbases)); + parArray.push_back(new ParameterInfoVector (-1, -1, "clip3pNbases", &clip3pNbases)); + parArray.push_back(new ParameterInfoVector (-1, -1, "clip3pAfterAdapterNbases", &clip3pAfterAdapterNbases)); + parArray.push_back(new ParameterInfoVector (-1, -1, "clip3pAdapterSeq", &clip3pAdapterSeq)); + parArray.push_back(new ParameterInfoVector (-1, -1, "clip3pAdapterMMp", &clip3pAdapterMMp)); + + //binning, 
anchors, windows + parArray.push_back(new ParameterInfoScalar (-1, -1, "winBinNbits", &winBinNbits)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "winAnchorDistNbins", &winAnchorDistNbins)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "winFlankNbins", &winFlankNbins)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "winAnchorMultimapNmax", &winAnchorMultimapNmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "winReadCoverageRelativeMin", &winReadCoverageRelativeMin)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "winReadCoverageBasesMin", &winReadCoverageBasesMin)); + + //scoring + parArray.push_back(new ParameterInfoScalar (-1, -1, "scoreGap", &scoreGap)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "scoreGapNoncan", &scoreGapNoncan)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "scoreGapGCAG", &scoreGapGCAG)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "scoreGapATAC", &scoreGapATAC)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "scoreStitchSJshift", &scoreStitchSJshift)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "scoreGenomicLengthLog2scale", &scoreGenomicLengthLog2scale)); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "scoreDelBase", &scoreDelBase)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "scoreDelOpen", &scoreDelOpen)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "scoreInsOpen", &scoreInsOpen)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "scoreInsBase", &scoreInsBase)); + + //alignment + parArray.push_back(new ParameterInfoScalar (-1, -1, "seedSearchLmax", &seedSearchLmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "seedSearchStartLmax", &seedSearchStartLmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "seedSearchStartLmaxOverLread", &seedSearchStartLmaxOverLread)); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "seedPerReadNmax", &seedPerReadNmax)); + parArray.push_back(new 
ParameterInfoScalar (-1, -1, "seedPerWindowNmax", &seedPerWindowNmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "seedNoneLociPerWindow", &seedNoneLociPerWindow)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "seedMultimapNmax", &seedMultimapNmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "seedSplitMin", &seedSplitMin)); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignIntronMin", &alignIntronMin)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignIntronMax", &alignIntronMax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignMatesGapMax", &alignMatesGapMax)); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignTranscriptsPerReadNmax", &alignTranscriptsPerReadNmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignSJoverhangMin", &alignSJoverhangMin)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignSJDBoverhangMin", &alignSJDBoverhangMin)); + parArray.push_back(new ParameterInfoVector (-1, -1, "alignSJstitchMismatchNmax", &alignSJstitchMismatchNmax)); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignSplicedMateMapLmin", &alignSplicedMateMapLmin)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignSplicedMateMapLminOverLmate", &alignSplicedMateMapLminOverLmate)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignWindowsPerReadNmax", &alignWindowsPerReadNmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignTranscriptsPerWindowNmax", &alignTranscriptsPerWindowNmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignEndsType", &alignEndsType.in)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignSoftClipAtReferenceEnds", &alignSoftClipAtReferenceEnds.in)); + + parArray.push_back(new ParameterInfoVector (-1, -1, "alignEndsProtrude", &alignEndsProtrude.in)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "alignInsertionFlush", &alignInsertionFlush.in)); + + //peOverlap + 
parArray.push_back(new ParameterInfoScalar (-1, -1, "peOverlapNbasesMin", &peOverlap.NbasesMin)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "peOverlapMMp", &peOverlap.MMp)); + + //chimeric + parArray.push_back(new ParameterInfoScalar (-1, -1, "chimSegmentMin", &pCh.segmentMin)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "chimScoreMin", &pCh.scoreMin)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "chimScoreDropMax", &pCh.scoreDropMax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "chimScoreSeparation", &pCh.scoreSeparation)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "chimScoreJunctionNonGTAG", &pCh.scoreJunctionNonGTAG)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "chimMainSegmentMultNmax", &pCh.mainSegmentMultNmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "chimJunctionOverhangMin", &pCh.junctionOverhangMin)); + parArray.push_back(new ParameterInfoVector (-1, -1, "chimOutType", &pCh.out.type)); + parArray.push_back(new ParameterInfoVector (-1, -1, "chimFilter", &pCh.filter.stringIn)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "chimSegmentReadGapMax", &pCh.segmentReadGapMax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "chimMultimapNmax", &pCh.multimapNmax)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "chimMultimapScoreRange", &pCh.multimapScoreRange)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "chimNonchimScoreDropMin", &pCh.nonchimScoreDropMin)); + parArray.push_back(new ParameterInfoVector (-1, -1, "chimOutJunctionFormat", &pCh.outJunctionFormat)); + + //sjdb + parArray.push_back(new ParameterInfoVector (-1, -1, "sjdbFileChrStartEnd", &pGe.sjdbFileChrStartEnd)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "sjdbGTFfile", &pGe.sjdbGTFfile)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "sjdbGTFchrPrefix", &pGe.sjdbGTFchrPrefix)); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "sjdbGTFfeatureExon", 
&pGe.sjdbGTFfeatureExon)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "sjdbGTFtagExonParentTranscript", &pGe.sjdbGTFtagExonParentTranscript)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "sjdbGTFtagExonParentGene", &pGe.sjdbGTFtagExonParentGene)); + parArray.push_back(new ParameterInfoVector (-1, -1, "sjdbGTFtagExonParentGeneName", &pGe.sjdbGTFtagExonParentGeneName)); + parArray.push_back(new ParameterInfoVector (-1, -1, "sjdbGTFtagExonParentGeneType", &pGe.sjdbGTFtagExonParentGeneType)); + + parArray.push_back(new ParameterInfoScalar (-1, -1, "sjdbOverhang", &pGe.sjdbOverhang)); + pGe.sjdbOverhang_par=parArray.size()-1; + parArray.push_back(new ParameterInfoScalar (-1, -1, "sjdbScore", &pGe.sjdbScore)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "sjdbInsertSave", &pGe.sjdbInsertSave)); + + //variation + parArray.push_back(new ParameterInfoScalar (-1, -1, "varVCFfile", &var.vcfFile)); + + //WASP + parArray.push_back(new ParameterInfoScalar (-1, -1, "waspOutputMode", &wasp.outputMode)); + + //quant + parArray.push_back(new ParameterInfoVector (-1, -1, "quantMode", &quant.mode)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "quantTranscriptomeBAMcompression", &quant.trSAM.bamCompression)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "quantTranscriptomeBan", &quant.trSAM.ban)); + + //2-pass + parArray.push_back(new ParameterInfoScalar (-1, -1, "twopass1readsN", &twoPass.pass1readsN)); + twoPass.pass1readsN_par=parArray.size()-1; + parArray.push_back(new ParameterInfoScalar (-1, -1, "twopassMode", &twoPass.mode)); + + //solo + parArray.push_back(new ParameterInfoScalar (-1, -1, "soloType", &pSolo.typeStr)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "soloCBstart", &pSolo.cbS)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "soloUMIstart", &pSolo.umiS)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "soloCBlen", &pSolo.cbL)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "soloUMIlen", 
&pSolo.umiL)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "soloBarcodeReadLength", &pSolo.bL)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "soloCBwhitelist", &pSolo.soloCBwhitelist)); + parArray.push_back(new ParameterInfoScalar (-1, -1, "soloStrand", &pSolo.strandStr)); + parArray.push_back(new ParameterInfoVector (-1, -1, "soloOutFileNames", &pSolo.outFileNames)); + parArray.push_back(new ParameterInfoVector (-1, -1, "soloFeatures", &pSolo.featureIn)); + parArray.push_back(new ParameterInfoVector (-1, -1, "soloUMIdedup", &pSolo.umiDedup)); + + parameterInputName.push_back("Default"); + parameterInputName.push_back("Command-Line-Initial"); + parameterInputName.push_back("Command-Line"); + parameterInputName.push_back("genomeParameters.txt"); + +}; + +Parameters::~Parameters() { + // TODO cleanup sjNovel*? clip3pAdapterSeqNum? + for (auto *p : parArray) { + if (p != nullptr) { + delete p; + } + } + if (inOut != nullptr) { + delete inOut; + } +} + + +void Parameters::inputParameters (int argInN, char* argIn[]) {//input parameters: default, from files, from command line + +///////// Default parameters + #include "parametersDefault.xxd" + string parString( (const char*) parametersDefault,parametersDefault_len); + stringstream parStream (parString); + + scanAllLines(parStream, 0, -1); + for (uint ii=0; iiinputLevel<0) { + ostringstream errOut; + errOut <<"BUG: DEFAULT parameter value not defined: "<nameString; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + }; + +///////// Initial parameters from Command Line + + commandLine=""; + string commandLineFile=""; + + if (argInN>1) {//scan parameters from command line + commandLine += string(argIn[0]); + for (int iarg=1; iarglogMain.open((outFileNamePrefix + "Log.out").c_str()); + if (inOut->logMain.fail()) { + ostringstream errOut; + errOut <<"EXITING because of FATAL ERROR: could not create output file: "<logMain, EXIT_CODE_PARAMETER, *this); + }; + + 
inOut->logMain << "STAR version=" << STAR_VERSION << "\n"; + inOut->logMain << "STAR compilation time,server,dir=" << COMPILATION_TIME_PLACE << "\n"; + #ifdef COMPILE_FOR_LONG_READS + inOut->logMain << "Compiled for LONG reads" << "\n"; + #endif + + //define what goes to cout + if (outStd=="Log") { + inOut->logStdOut=& std::cout; + } else if (outStd=="SAM" || outStd=="BAM_Unsorted" || outStd=="BAM_SortedByCoordinate" || outStd=="BAM_Quant") { + inOut->logStdOutFile.open((outFileNamePrefix + "Log.std.out").c_str()); + inOut->logStdOut= & inOut->logStdOutFile; + } else { + ostringstream errOut; + errOut <<"EXITING because of FATAL PARAMETER error: outStd="<logMain, EXIT_CODE_PARAMETER, *this); + }; + + /* + inOut->logMain << "##### DEFAULT parameters:\n" <inputLevel==0) { + inOut->logMain << setw(PAR_NAME_PRINT_WIDTH) << parArray[ii]->nameString <<" "<< *(parArray[ii]) << endl; + }; + }; + */ + + inOut->logMain <<"##### Command Line:\n"<logMain << "##### Initial USER parameters from Command Line:\n"; + for (uint ii=0; iiinputLevel==1) { + inOut->logMain << setw(PAR_NAME_PRINT_WIDTH) << parArray[ii]->nameString <<" "<< *(parArray[ii]) << endl; + }; + }; + +///////// Parameters files + + if (parametersFiles.at(0) != "-") {//read parameters from a user-defined file + for (uint ii=0; iilogMain << "##### USER parameters from user-defined parameters file " <logMain, EXIT_CODE_PARAMETER, *this); + }; + }; + }; + +///////// Command Line Final + + if (argInN>1) {//scan all parameters from command line and override previuos values + inOut->logMain << "###### All USER parameters from Command Line:\n" <logMain << "##### Finished reading parameters from all sources\n\n" << flush; + + inOut->logMain << "##### Final user re-defined parameters-----------------:\n" << flush; + + ostringstream clFull; + clFull << argIn[0]; + for (uint ii=0; iiinputLevel>0) { + inOut->logMain << setw(PAR_NAME_PRINT_WIDTH) << parArray[ii]->nameString <<" "<< *(parArray[ii]) << endl; + if 
(parArray[ii]->nameString != "parametersFiles" ) { + clFull << " --" << parArray[ii]->nameString << " " << *(parArray[ii]); + }; + }; + }; + commandLineFull=clFull.str(); + inOut->logMain << "\n-------------------------------\n##### Final effective command line:\n" << clFull.str() << "\n"; + + /* + // parOut.close(); + inOut->logMain << "\n##### Final parameters after user input--------------------------------:\n" << flush; + // parOut.open("Parameters.all.out"); + for (uint ii=0; iilogMain << setw(PAR_NAME_PRINT_WIDTH) << parArray[ii]->nameString <<" "<< *(parArray[ii]) << endl; + }; + // parOut.close(); + */ + + inOut->logMain << "----------------------------------------\n\n" << flush; + + + ///////////////////////////////////////// Old variables + //splitting + Qsplit=0; + maxNsplit=10; + minLsplit=seedSplitMin; + minLmap=5; + + + +////////////////////////////////////////////////////// Calculate and check parameters + iReadAll=0; + + if (runDirPermIn=="User_RWX") + { + runDirPerm=S_IRWXU; + } else if (runDirPermIn=="All_RWX") + { + runDirPerm= S_IRWXU | S_IRWXG | S_IRWXO; + } else + { + ostringstream errOut; + errOut << "EXITING because of FATAL INPUT ERROR: unrecognized option in --runDirPerm=" << runDirPerm << "\n"; + errOut << "SOLUTION: use one of the allowed values of --runDirPerm : 'User_RWX' or 'All_RWX' \n"; + exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + /* + if (outTmpDir=="-") { + outFileTmp=outFileNamePrefix +"_STARtmp/"; + sysRemoveDir (outFileTmp); + } else { + outFileTmp=outTmpDir + "/"; + }; + */ + /* + if (mkdir (outFileTmp.c_str(),runDirPerm)!=0) { + ostringstream errOut; + errOut <<"EXITING because of fatal ERROR: could not make temporary directory: "<< outFileTmp<<"\n"; + errOut <<"SOLUTION: (i) please check the path and writing permissions \n (ii) if you specified --outTmpDir, and this directory exists - please remove it before running STAR\n"<logMain, EXIT_CODE_PARAMETER, *this); + }; + */ + 
//g_threadChunks.threadBool=(runThreadN>1); + + //wigOut parameters + if (outWigType.at(0)=="None") { + outWigFlags.yes=false; + } else if (outWigType.at(0)=="bedGraph") { + outWigFlags.yes=true; + outWigFlags.format=0; + } else if (outWigType.at(0)=="wiggle") { + outWigFlags.yes=true; + outWigFlags.format=1; + } else { + ostringstream errOut; + errOut << "EXITING because of FATAL INPUT ERROR: unrecognized option in --outWigType=" << outWigType.at(0) << "\n"; + errOut << "SOLUTION: use one of the allowed values of --outWigType : 'None' or 'bedGraph' \n"; + exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + if (outWigStrand.at(0)=="Stranded") { + outWigFlags.strand=true; + } else if (outWigStrand.at(0)=="Unstranded") { + outWigFlags.strand=false; + } else { + ostringstream errOut; + errOut << "EXITING because of FATAL INPUT ERROR: unrecognized option in --outWigStrand=" << outWigStrand.at(0) << "\n"; + errOut << "SOLUTION: use one of the allowed values of --outWigStrand : 'Stranded' or 'Unstranded' \n"; + exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + if (outWigType.size()==1) {//simple bedGraph + outWigFlags.type=0; + } else { + if (outWigType.at(1)=="read1_5p") { + outWigFlags.type=1; + } else if (outWigType.at(1)=="read2") { + outWigFlags.type=2; + } else { + ostringstream errOut; + errOut << "EXITING because of FATAL INPUT ERROR: unrecognized second option in --outWigType=" << outWigType.at(1) << "\n"; + errOut << "SOLUTION: use one of the allowed values of --outWigType : 'read1_5p' \n"; + exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + }; + + //wigOut parameters + if (outWigNorm.at(0)=="None") { + outWigFlags.norm=0; + } else if (outWigNorm.at(0)=="RPM") { + outWigFlags.norm=1; + } else { + ostringstream errOut; + errOut << "EXITING because of fatal parameter ERROR: unrecognized option in --outWigNorm=" << outWigNorm.at(0) << "\n"; + 
errOut << "SOLUTION: use one of the allowed values of --outWigNorm : 'None' or 'RPM' \n"; + exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + + //remove duplicates parameters + if (removeDuplicates.mode=="UniqueIdentical") + { + removeDuplicates.yes=true; + removeDuplicates.markMulti=true; + } else if (removeDuplicates.mode=="UniqueIdenticalNotMulti") + { + removeDuplicates.yes=true; + removeDuplicates.markMulti=false; + }; + + if (runMode=="alignReads") { + inOut->logProgress.open((outFileNamePrefix + "Log.progress.out").c_str()); + } + + outSAMbool=false; + outBAMunsorted=false; + outBAMcoord=false; + if (runMode=="alignReads" && outSAMmode != "None") {//open SAM file and write header + if (outSAMtype.at(0)=="BAM") { + if (outSAMtype.size()<2) { + ostringstream errOut; + errOut <<"EXITING because of fatal PARAMETER error: missing BAM option\n"; + errOut <<"SOLUTION: re-run STAR with one of the allowed values of --outSAMtype BAM Unsorted OR SortedByCoordinate OR both\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + for (uint32 ii=1; iilogMain, EXIT_CODE_PARAMETER, *this); + }; + }; + //TODO check for conflicts + if (outBAMunsorted) { + if (outStd=="BAM_Unsorted") { + outBAMfileUnsortedName="-"; + } else { + outBAMfileUnsortedName=outFileNamePrefix + "Aligned.out.bam"; + }; + throw std::runtime_error("Unimplemented!"); + //inOut->outBAMfileUnsorted = bgzf_open(outBAMfileUnsortedName.c_str(),("w"+to_string((long long) outBAMcompression)).c_str()); + }; + if (outBAMcoord) { + if (outStd=="BAM_SortedByCoordinate") { + outBAMfileCoordName="-"; + } else { + outBAMfileCoordName=outFileNamePrefix + "Aligned.sortedByCoord.out.bam"; + }; + throw std::runtime_error("Unimplemented!"); + //inOut->outBAMfileCoord = bgzf_open(outBAMfileCoordName.c_str(),("w"+to_string((long long) outBAMcompression)).c_str()); + if (outBAMsortingThreadN==0) { + outBAMsortingThreadNactual=min(6, runThreadN); 
+ } else { + outBAMsortingThreadNactual=outBAMsortingThreadN; + }; + outBAMcoordNbins=max((uint32)outBAMsortingThreadNactual*3,outBAMsortingBinsN); + outBAMsortingBinStart= new uint64 [outBAMcoordNbins]; + outBAMsortingBinStart[0]=1;//this initial value means that the bin sizes have not been determined yet + + outBAMsortTmpDir=outFileTmp+"/BAMsort/"; + mkdir(outBAMsortTmpDir.c_str(),runDirPerm); + }; + } else if (outSAMtype.at(0)=="SAM") { + if (outSAMtype.size()>1) + { + ostringstream errOut; + errOut <<"EXITING because of fatal PARAMETER error: --outSAMtype SAM can cannot be combined with "<logMain, EXIT_CODE_PARAMETER, *this); + }; + outSAMbool=true; + if (outStd=="SAM") { + inOut->outSAM = & std::cout; + } else { + inOut->outSAMfile.open((outFileNamePrefix + "Aligned.out.sam").c_str()); + inOut->outSAM = & inOut->outSAMfile; + }; + } else if (outSAMtype.at(0)=="None") { + //nothing to do, all flags are already false + } else { + ostringstream errOut; + errOut <<"EXITING because of fatal input ERROR: unknown value for the first word of outSAMtype: "<< outSAMtype.at(0) <<"\n"; + errOut <<"SOLUTION: re-run STAR with one of the allowed values of outSAMtype: BAM or SAM \n"<logMain, EXIT_CODE_PARAMETER, *this); + }; + }; + + if (!outBAMcoord && outWigFlags.yes && runMode=="alignReads") { + ostringstream errOut; + errOut <<"SOLUTION: re-run STAR with with --outSAMtype BAM SortedByCoordinate, or, id you also need unsroted BAM, with --outSAMtype BAM SortedByCoordinate Unsorted\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + //versions + for (uint ii=0;ii<1;ii++) { + if (parArray[ii]->inputLevel>0) { + ostringstream errOut; + errOut <<"EXITING because of fatal input ERROR: the version parameter "<< parArray[ii]->nameString << " cannot be re-defined by the user\n"; + errOut <<"SOLUTION: please remove this parameter from the command line or input files and re-start STAR\n"<logMain, EXIT_CODE_PARAMETER, *this); + }; + }; + 
+ //run + if (runThreadN<=0) { + ostringstream errOut; + errOut <<"EXITING: fatal input ERROR: runThreadN must be >0, user-defined runThreadN="<logMain, EXIT_CODE_PARAMETER, *this); + }; + + // + if (outFilterType=="BySJout" && outSAMorder=="PairedKeepInputOrder") { + ostringstream errOut; + errOut <<"EXITING: fatal input ERROR: --outFilterType=BySJout is not presently compatible with --outSAMorder=PairedKeepInputOrder\n"; + errOut <<"SOLUTION: re-run STAR without setting one of those parameters.\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + if (!outSAMbool && outSAMorder=="PairedKeepInputOrder") { + ostringstream errOut; + errOut <<"EXITING: fatal input ERROR: --outSAMorder=PairedKeepInputOrder is presently only compatible with SAM output, i.e. default --outSMAtype SAM\n"; + errOut <<"SOLUTION: re-run STAR without --outSAMorder=PairedKeepInputOrder, or with --outSAMorder=PairedKeepInputOrder --outSMAtype SAM .\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + //SJ filtering + for (int ii=0;ii<4;ii++) { + if (outSJfilterOverhangMin.at(ii)<0) outSJfilterOverhangMin.at(ii)=numeric_limits::max(); + if (outSJfilterCountUniqueMin.at(ii)<0) outSJfilterCountUniqueMin.at(ii)=numeric_limits::max(); + if (outSJfilterCountTotalMin.at(ii)<0) outSJfilterCountTotalMin.at(ii)=numeric_limits::max(); + if (outSJfilterDistToOtherSJmin.at(ii)<0) outSJfilterDistToOtherSJmin.at(ii)=numeric_limits::max(); + + if (alignSJstitchMismatchNmax.at(ii)<0) alignSJstitchMismatchNmax.at(ii)=numeric_limits::max(); + }; + + if (limitGenomeGenerateRAM==0) {//must be >0 + inOut->logMain <<"EXITING because of FATAL PARAMETER ERROR: limitGenomeGenerateRAM=0\n"; + inOut->logMain <<"SOLUTION: please specify a >0 value for limitGenomeGenerateRAM\n"<1000000000000) {// + inOut->logMain <<"WARNING: specified limitGenomeGenerateRAM="<0) ++ii;//skip comma + outSAMattrRGlineSplit.push_back(outSAMattrRGline.at(ii)); 
//star new RG line with the first field which must be ID:xxx + if (outSAMattrRGlineSplit.back().substr(0,3)!="ID:") { + ostringstream errOut; + errOut <<"EXITING because of FATAL INPUT ERROR: the first word of a line from --outSAMattrRGline="<logMain, EXIT_CODE_PARAMETER, *this); + }; + outSAMattrRG.push_back(outSAMattrRGlineSplit.back().substr(3)); //this adds the ID field + } else {//keep adding fields to this RG line, until the next comma + outSAMattrRGlineSplit.back()+="\t" + outSAMattrRGline.at(ii); + }; + }; + }; + }; + + outSAMfilter.KeepOnlyAddedReferences=false; + outSAMfilter.KeepAllAddedReferences=false; + outSAMfilter.yes=true; + if (outSAMfilter.mode.at(0)=="KeepOnlyAddedReferences") + { + outSAMfilter.KeepOnlyAddedReferences=true; + } else if (outSAMfilter.mode.at(0)=="KeepAllAddedReferences") + { + outSAMfilter.KeepAllAddedReferences=true; + } else if (outSAMfilter.mode.at(0)=="None") + { + outSAMfilter.yes=false; + } else + { + ostringstream errOut; + errOut <<"EXITING because of FATAL INPUT ERROR: unknown/unimplemented value for --outSAMfilter: "<logMain, EXIT_CODE_PARAMETER, *this); + }; + + if ( (outSAMfilter.KeepOnlyAddedReferences || outSAMfilter.KeepAllAddedReferences) && pGe.gFastaFiles.at(0)=="-" ) { + ostringstream errOut; + errOut <<"EXITING because of FATAL INPUT ERROR: --outSAMfilter KeepOnlyAddedReferences OR KeepAllAddedReferences options can only be used if references are added on-the-fly with --genomeFastaFiles" <<"\n"; + errOut <<"SOLUTION: use default --outSAMfilter None, OR add references with --genomeFataFiles\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + + if (outMultimapperOrder.mode=="Old_2.4") + { + outMultimapperOrder.random=false; + } else if (outMultimapperOrder.mode=="Random") + { + outMultimapperOrder.random=true; + } else + { + ostringstream errOut; + errOut <<"EXITING because of FATAL INPUT ERROR: unknown/unimplemented value for --outMultimapperOrder: "<logMain, 
EXIT_CODE_PARAMETER, *this); + }; + + //read parameters + if (readFilesType.at(0)=="Fastx") { + readFilesTypeN=1; + } else if (readFilesType.at(0)=="SAM"){ + readFilesTypeN=10; + } else { + ostringstream errOut; + errOut <<"EXITING because of FATAL INPUT ERROR: unknown/unimplemented value for --readFilesType: "<logMain, EXIT_CODE_PARAMETER, *this); + }; + + if (readFilesTypeN==1) { + readNmates=readFilesIn.size(); //for now the number of mates is defined by the number of input files + } else if (readFilesTypeN==10) {//find the number of mates from the SAM file + if (readFilesType.size()==2 && readFilesType.at(1)=="SE") { + readNmates=1; + } else if (readFilesType.size()==2 && readFilesType.at(1)=="PE") { + readNmates=2; + } else { + ostringstream errOut; + errOut <<"EXITING because of FATAL INPUT ERROR: --readFilesType SAM requires specifying SE or PE reads"<<"\n"; + errOut <<"SOLUTION: specify --readFilesType SAM SE for single-end reads or --readFilesType SAM PE for paired-end reads\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + }; + readNmatesIn=readNmates; + + //two-pass + if (parArray.at(twoPass.pass1readsN_par)->inputLevel>0 && twoPass.mode=="None") + { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: --twopass1readsN is defined, but --twoPassMode is not defined\n"; + errOut << "SOLUTION: to activate the 2-pass mode, use --twopassMode Basic"; + exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + twoPass.yes=false; + twoPass.pass2=false; + if (twoPass.mode!="None") {//2-pass parameters + if (runMode!="alignReads") + { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: 2-pass mapping option can only be used with --runMode alignReads\n"; + errOut << "SOLUTION: remove --twopassMode option"; + exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + if (twoPass.mode!="Basic") + { + 
ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: unrecognized value of --twopassMode="<logMain, EXIT_CODE_PARAMETER, *this); + }; + + if (twoPass.pass1readsN==0) + { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: --twopass1readsN = 0 in the 2-pass mode\n"; + errOut << "SOLUTION: for the 2-pass mode, specify --twopass1readsN > 0. Use a very large number or -1 to map all reads in the 1st pass.\n"; + exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + if (pGe.gLoad!="NoSharedMemory") { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: 2-pass method is not compatible with --genomeLoad "<logMain, EXIT_CODE_PARAMETER, *this); + }; + /* + twoPass.yes=true; + twoPass.dir=outFileNamePrefix+"_STARpass1/"; + sysRemoveDir (twoPass.dir); + if (mkdir (twoPass.dir.c_str(),runDirPerm)!=0) { + ostringstream errOut; + errOut <<"EXITING because of fatal ERROR: could not make pass1 directory: "<< twoPass.dir<<"\n"; + errOut <<"SOLUTION: please check the path and writing permissions \n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + */ + }; + + // openReadFiles depends on twoPass for reading SAM header + if (runMode=="alignReads" && pGe.gLoad!="Remove" && pGe.gLoad!="LoadAndExit") {//open reads files to check if they are present + //openReadsFiles(); + readNmates = 2; + //check sizes of the mate files, if not the same, assume mates are not the same length + if (readNmates==1) { + readMatesEqualLengths=true; + } else if (readNmates > 2){ + ostringstream errOut; + errOut <<"EXITING: because of fatal input ERROR: number of read mates files > 2: " <logMain, EXIT_CODE_PARAMETER, *this); + } else if (readMatesLengthsIn=="Equal") { + readMatesEqualLengths=true; + } else if (readMatesLengthsIn=="NotEqual") { + readMatesEqualLengths=false; + } else { + ostringstream errOut; + errOut <<"EXITING because of FATAL input 
ERROR: the value of the parameter readMatesLengthsIn=" << readMatesLengthsIn <<" is not among the allowed values: Equal or NotEqual\n"; + errOut <<"SOLUTION: specify one of the allowed values: Equal or NotEqual\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + if ( runMode=="alignReads" && outReadsUnmapped=="Fastx" ) {//open unmapped reads file + for (uint imate=0;imateoutUnmappedReadsStream[imate].open(ff.str().c_str()); + }; + }; + + + if (outFilterType=="Normal") { + outFilterBySJoutStage=0; + } else if (outFilterType=="BySJout") { + outFilterBySJoutStage=1; + } else { + ostringstream errOut; + errOut <<"EXITING because of FATAL input ERROR: unknown value of parameter outFilterType: " << outFilterType <<"\n"; + errOut <<"SOLUTION: specify one of the allowed values: Normal | BySJout\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + }; + + if (outSAMmapqUnique<0 || outSAMmapqUnique>255) { + ostringstream errOut; + errOut <<"EXITING because of FATAL input ERROR: out of range value for outSAMmapqUnique=" << outSAMmapqUnique <<"\n"; + errOut <<"SOLUTION: specify outSAMmapqUnique within the range of 0 to 255\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + // in/out buffers + #define BUFFER_InSizeFraction 0.5 + if (limitIObufferSizelogMain, EXIT_CODE_PARAMETER, *this); + }; + chunkInSizeBytesArray=(uint) ((int64_t)((limitIObufferSize-limitOutSJcollapsed*Junction::dataSize)*BUFFER_InSizeFraction)/2); + chunkOutBAMsizeBytes= (uint) ((int64_t)((1.0/BUFFER_InSizeFraction-1.0)*chunkInSizeBytesArray*2.0)); + chunkInSizeBytes=chunkInSizeBytesArray-2*(DEF_readSeqLengthMax+1)-2*DEF_readNameLengthMax;//to prevent overflow + + //basic trimming + if (clip5pNbases.size()==1 && readNmates==2) clip5pNbases.push_back(clip5pNbases[0]); + if (clip3pNbases.size()==1 && readNmates==2) clip3pNbases.push_back(clip3pNbases[0]); + if 
(clip3pAfterAdapterNbases.size()==1 && readNmates==2) clip3pAfterAdapterNbases.push_back(clip3pAfterAdapterNbases[0]); + + //adapter clipping + if (clip3pAdapterSeq.size()==1 && readNmates==2) clip3pAdapterSeq.push_back(clip3pAdapterSeq[0]); + if (clip3pAdapterMMp.size()==1 && readNmates==2) clip3pAdapterMMp.push_back(clip3pAdapterMMp[0]); + for (uint imate=0;imatefastaOutSeqs.open("Seqs.out.fasta"); + }; + }; + + //variation + var.yes=false; + if (var.vcfFile!="-") + { + var.yes=true; + }; + + //WASP + wasp.yes=false; + wasp.SAMtag=false; + if (wasp.outputMode=="SAMtag") { + wasp.yes=true; + wasp.SAMtag=true; + } else if (wasp.outputMode=="None") { + //nothing to do + } else { + ostringstream errOut; + errOut <<"EXITING because of FATAL INPUT ERROR: unknown/unimplemented --waspOutputMode option: "<logMain, EXIT_CODE_PARAMETER, *this); + }; + + if (wasp.yes && !var.yes) { + ostringstream errOut; + errOut <<"EXITING because of FATAL INPUT ERROR: --waspOutputMode option requires VCF file: "<logMain, EXIT_CODE_PARAMETER, *this); + }; + + if (wasp.yes && outSAMtype.at(0)!="BAM") { + ostringstream errOut; + errOut <<"EXITING because of FATAL INPUT ERROR: --waspOutputMode requires output to BAM file\n"; + errOut <<"SOLUTION: re-run STAR with --waspOutputMode ... and --outSAMtype BAM ... 
\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + //quantification parameters + quant.yes=false; + quant.geneFull.yes=false; + quant.geCount.yes=false; + quant.trSAM.yes=false; + quant.trSAM.bamYes=false; + quant.trSAM.indel=false; + quant.trSAM.softClip=false; + quant.trSAM.singleEnd=false; + if (quant.mode.at(0) != "-") { + quant.yes=true; + for (uint32 ii=0; ii-2) + quant.trSAM.bamYes=true; + + if (quant.trSAM.bamYes) { + if (outStd=="BAM_Quant") { + outFileNamePrefix="-"; + } else { + outQuantBAMfileName=outFileNamePrefix + "Aligned.toTranscriptome.out.bam"; + }; + throw std::runtime_error("Unimplemented!"); + //inOut->outQuantBAMfile=bgzf_open(outQuantBAMfileName.c_str(),("w"+to_string((long long) quant.trSAM.bamCompression)).c_str()); + }; + if (quant.trSAM.ban=="IndelSoftclipSingleend") { + quant.trSAM.indel=false; + quant.trSAM.softClip=false; + quant.trSAM.singleEnd=false; + } else if (quant.trSAM.ban=="Singleend") { + quant.trSAM.indel=true; + quant.trSAM.softClip=true; + quant.trSAM.singleEnd=false; + }; + } else if (quant.mode.at(ii)=="GeneCounts") { + quant.geCount.yes=true; + quant.geCount.outFile=outFileNamePrefix + "ReadsPerGene.out.tab"; + } else { + ostringstream errOut; + errOut << "EXITING because of fatal INPUT error: unrecognized option in --quantMode=" << quant.mode.at(ii) << "\n"; + errOut << "SOLUTION: use one of the allowed values of --quantMode : TranscriptomeSAM or GeneCounts or - .\n"; + exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + }; + }; + + outSAMstrandField.type=0; //none + if (outSAMstrandField.in=="None") { + outSAMstrandField.type=0; + } else if (outSAMstrandField.in=="intronMotif") { + outSAMstrandField.type=1; + } else { + ostringstream errOut; + errOut << "EXITING because of fatal INPUT error: unrecognized option in outSAMstrandField=" << outSAMstrandField.in << "\n"; + errOut << "SOLUTION: use one of the allowed values of 
--outSAMstrandField : None or intronMotif \n"; + exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + //solo + pSolo.initialize(this); + + //outSAMattributes + outSAMattrPresent.NH=false;//TODO re-write as class with constructor? + outSAMattrPresent.HI=false; + outSAMattrPresent.AS=false; + outSAMattrPresent.NM=false; + outSAMattrPresent.MD=false; + outSAMattrPresent.nM=false; + outSAMattrPresent.jM=false; + outSAMattrPresent.jI=false; + outSAMattrPresent.RG=false; + outSAMattrPresent.MC=false; + outSAMattrPresent.XS=false; + outSAMattrPresent.vA=false; + outSAMattrPresent.vG=false; + outSAMattrPresent.vW=false; + outSAMattrPresent.ch=false; + outSAMattrPresent.rB=false; + outSAMattrPresent.CR=false; + outSAMattrPresent.CY=false; + outSAMattrPresent.UR=false; + outSAMattrPresent.UY=false; + + + //for quant SAM output only NH and HI flags + outSAMattrPresentQuant=outSAMattrPresent; + outSAMattrPresentQuant.NH=true; + outSAMattrPresentQuant.HI=true; + outSAMattrOrderQuant.push_back(ATTR_NH); + outSAMattrOrderQuant.push_back(ATTR_HI); + + vector vAttr1; + if (outSAMattributes.at(0)=="None") { + } else if (outSAMattributes.at(0)=="All"){ + vAttr1={"NH","HI","AS","nM","NM","MD","jM","jI","MC","ch"}; + } else if (outSAMattributes.at(0)=="Standard"){ + vAttr1={"NH","HI","AS","nM"}; + } else { + vAttr1=outSAMattributes; + }; + + for (uint ii=0;iilogMain << "WARNING --outSAMattributes contains XS, therefore STAR will use --outSAMstrandField intronMotif" <logMain, EXIT_CODE_PARAMETER, *this); + }; + }; + + if (!var.yes && (outSAMattrPresent.vA | outSAMattrPresent.vG)) { + ostringstream errOut; + errOut <<"EXITING because of fatal PARAMETER error: --outSAMattributes contains vA and/or vG tag(s), but --varVCFfile is not set\n"; + errOut <<"SOLUTION: re-run STAR with a --varVCFfile option, or without vA/vG tags in --outSAMattributes\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + if 
(!wasp.yes && outSAMattrPresent.vW) { + ostringstream errOut; + errOut <<"EXITING because of fatal PARAMETER error: --outSAMattributes contains vW tag, but --waspOutputMode is not set\n"; + errOut <<"SOLUTION: re-run STAR with a --waspOutputMode option, or without vW tags in --outSAMattributes\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + + if (outSAMattrRG.size()>0 && !outSAMattrPresent.RG) { + outSAMattrOrder.push_back(ATTR_RG); + outSAMattrOrderQuant.push_back(ATTR_RG); + outSAMattrPresent.RG=true; + inOut->logMain << "WARNING --outSAMattrRG defines a read group, therefore STAR will output RG attribute" <logMain, EXIT_CODE_PARAMETER, *this); + }; + + if (outSAMstrandField.type==1 && !outSAMattrPresent.XS) { + outSAMattrOrder.push_back(ATTR_XS); + inOut->logMain << "WARNING --outSAMstrandField=intronMotif, therefore STAR will output XS attribute" <logMain << "WARNING --waspOutputMode is set, therefore STAR will output vW attribute" <logMain, EXIT_CODE_PARAMETER, *this); + }; + + //initialize chimeric parameters + pCh.initialize(this); + + alignEndsType.ext[0][0]=false; + alignEndsType.ext[0][1]=false; + alignEndsType.ext[1][0]=false; + alignEndsType.ext[1][1]=false; + + if (alignEndsType.in=="EndToEnd") + { + alignEndsType.ext[0][0]=true; + alignEndsType.ext[0][1]=true; + alignEndsType.ext[1][0]=true; + alignEndsType.ext[1][1]=true; + } else if (alignEndsType.in=="Extend5pOfRead1" ) + { + alignEndsType.ext[0][0]=true; + } else if (alignEndsType.in=="Extend5pOfReads12" ) + { + alignEndsType.ext[0][0]=true; + alignEndsType.ext[1][0]=true; + } else if (alignEndsType.in=="Extend3pOfRead1" ) + { + alignEndsType.ext[0][1]=true; + } else if (alignEndsType.in=="Local") + { + //nothing to do for now + } else + { + ostringstream errOut; + errOut <<"EXITING because of FATAL INPUT ERROR: unknown/unimplemented value for --alignEndsType: "<logMain, EXIT_CODE_PARAMETER, *this); + }; + +// #ifdef COMPILE_NO_SHM +// if 
(pGe.gLoad!="NoSharedMemory") { +// ostringstream errOut; +// errOut <<"EXITING because of FATAL INPUT ERROR: The code was compiled with NO SHARED MEMORY support, but pGe.gLoad="<logMain, EXIT_CODE_PARAMETER, *this); +// }; +// #endif + + //open compilation-dependent streams + #ifdef OUTPUT_localChains + inOut->outLocalChains.open((outFileNamePrefix + "LocalChains.out.tab").c_str()); + #endif + +// genomeNumToNT={'A','C','G','T','N'}; + strcpy(genomeNumToNT,"ACGTN"); + + if (pGe.gLoad!="LoadAndKeep" && pGe.gLoad!="LoadAndRemove" && pGe.gLoad!="Remove" && pGe.gLoad!="LoadAndExit" && pGe.gLoad!="NoSharedMemory") {// find shared memory fragment + ostringstream errOut; + errOut << "EXITING because of FATAL INPUT ERROR: --genomeLoad=" << pGe.gLoad << "\n" <logMain, EXIT_CODE_PARAMETER, *this); + }; + + //sjdb insert on the fly + + sjdbInsert.pass1=false; + sjdbInsert.pass2=false; + sjdbInsert.yes=false; + if (pGe.sjdbFileChrStartEnd.at(0)!="-" || pGe.sjdbGTFfile!="-") + {//will insert annotated sjdb on the fly + sjdbInsert.pass1=true; + sjdbInsert.yes=true; + }; + if (twoPass.yes) + { + sjdbInsert.pass2=true; + sjdbInsert.yes=true; + }; + + if (pGe.gLoad!="NoSharedMemory" && sjdbInsert.yes ) + { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: on the fly junction insertion and 2-pass mappng cannot be used with shared memory genome \n" ; + errOut << "SOLUTION: run STAR with --genomeLoad NoSharedMemory to avoid using shared memory\n" <logMain, EXIT_CODE_PARAMETER, *this); + }; + + if (runMode=="alignReads" && sjdbInsert.yes ) + {//run-time genome directory, this is needed for genome files generated on the fly + if (pGe.sjdbOverhang<=0) { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: pGe.sjdbOverhang <=0 while junctions are inserted on the fly with --sjdbFileChrStartEnd or/and --sjdbGTFfile\n"; + errOut << "SOLUTION: specify pGe.sjdbOverhang>0, ideally readmateLength-1"; + 
exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + /* + sjdbInsert.outDir=outFileNamePrefix+"_STARgenome/"; + sysRemoveDir (sjdbInsert.outDir); + if (mkdir (sjdbInsert.outDir.c_str(),runDirPerm)!=0) { + ostringstream errOut; + errOut <<"EXITING because of fatal ERROR: could not make run-time genome directory directory: "<< sjdbInsert.outDir<<"\n"; + errOut <<"SOLUTION: please check the path and writing permissions \n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + */ + }; + + if (outBAMcoord && limitBAMsortRAM==0) {//check limitBAMsortRAM + if (pGe.gLoad!="NoSharedMemory") { + ostringstream errOut; + errOut <<"EXITING because of fatal PARAMETERS error: limitBAMsortRAM=0 (default) cannot be used with --genomeLoad="< 10000000000 (i.e 10GB).\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + inOut->logMain<<"WARNING: --limitBAMsortRAM=0, will use genome size as RAM limit for BAM sorting\n"; + }; + + for (uint ii=0; iilogMain, EXIT_CODE_PARAMETER, *this); + }; + }; + + + //outSAMunmapped + outSAMunmapped.yes=false; + outSAMunmapped.within=false; + outSAMunmapped.keepPairs=false; + if (outSAMunmapped.mode.at(0)=="None" && outSAMunmapped.mode.size()==1) { + //nothing to do, all false + } else if (outSAMunmapped.mode.at(0)=="Within" && outSAMunmapped.mode.size()==1) { + outSAMunmapped.yes=true; + outSAMunmapped.within=true; + } else if (outSAMunmapped.mode.at(0)=="Within" && outSAMunmapped.mode.at(1)=="KeepPairs") { + outSAMunmapped.yes=true; + outSAMunmapped.within=true; + if (readNmates==2) + outSAMunmapped.keepPairs=true; + } else { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: unrecognized option for --outSAMunmapped="; + for (uint ii=0; iilogMain, EXIT_CODE_PARAMETER, *this); + }; + + alignEndsProtrude.nBasesMax=stoi(alignEndsProtrude.in.at(0),nullptr); + if (alignEndsProtrude.nBasesMax>0) {//allow ends 
protrusion + if (alignEndsProtrude.in.at(1)=="ConcordantPair") { + alignEndsProtrude.concordantPair=true; + } else if (alignEndsProtrude.in.at(1)=="DiscordantPair") { + alignEndsProtrude.concordantPair=false; + } else { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: unrecognized option in of --alignEndsProtrude="<logMain, EXIT_CODE_PARAMETER, *this); + }; + }; + + if (alignInsertionFlush.in=="None") { + alignInsertionFlush.flushRight=false; + } else if (alignInsertionFlush.in=="Right") { + alignInsertionFlush.flushRight=true; + } else { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: unrecognized option in of --alignInsertionFlush="<logMain, EXIT_CODE_PARAMETER, *this); + }; + + + //peOverlap + if (peOverlap.NbasesMin>0) { + peOverlap.yes=true; + } else { + peOverlap.yes=false; + }; + + //alignSoftClipAtReferenceEnds.in + if (alignSoftClipAtReferenceEnds.in=="Yes") { + alignSoftClipAtReferenceEnds.yes=true; + } else if (alignSoftClipAtReferenceEnds.in=="No") { + alignSoftClipAtReferenceEnds.yes=false; + } else { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: unrecognized option in of --alignSoftClipAtReferenceEnds="<logMain, EXIT_CODE_PARAMETER, *this); + }; + + outSAMreadIDnumber=false; + if (outSAMreadID=="Number") { + outSAMreadIDnumber=true; + }; + + //////////////////////////////////////////////// + inOut->logMain << "Finished loading and checking parameters\n" <> parIn; + if (parIn=="" || parIn.substr(0,2)=="//" || parIn.substr(0,1)=="#") return 0; //this is a comment + uint iPar; + for (iPar=0; iParnameString) {// + if (inputLevelRequested < 0 || inputLevelRequested == parArray[iPar]->inputLevelAllowed) { + break;//will read this parameter values + } else { + return 1; //do not read inputs not requested at this level + }; + }; + }; + + string parV(""); + lineInStream >> parV; + if (parV=="") {//parameter value cannot be empty + ostringstream errOut; + errOut << 
"EXITING: FATAL INPUT ERROR: empty value for parameter \""<< parIn << "\" in input \"" << parameterInputName.at(inputLevel) <<"\"\n"; + errOut << "SOLUTION: use non-empty value for this parameter\n"<logMain, EXIT_CODE_PARAMETER, *this); + }; + + lineInStream.str(lineIn); lineInStream.clear(); lineInStream >> parIn; //get the correct state of stream, past reading parIn + if (iPar==parArray.size()) {//string is not identified + ostringstream errOut; + errOut << "EXITING: FATAL INPUT ERROR: unrecognized parameter name \""<< parIn << "\" in input \"" << parameterInputName.at(inputLevel) <<"\"\n"; + errOut << "SOLUTION: use correct parameter name (check the manual)\n"<logMain, EXIT_CODE_PARAMETER, *this); + } else {//found the corresponding parameter + if (inputLevel==0 && parArray[iPar]->inputLevel>0) {//this is one of the initial parameters, it was read from Command Line and should not be re-defined + getline(lineInStream,parV); + inOut->logMain << setiosflags(ios::left) << setw(PAR_NAME_PRINT_WIDTH) << parArray[iPar]->nameString <inputLevelAllowed>0 && parArray[iPar]->inputLevelAllowed < inputLevel) {//this is initial parameter and cannot be redefined + ostringstream errOut; + errOut << "EXITING: FATAL INPUT ERROR: parameter \""<< parIn << "\" cannot be defined at the input level \"" << parameterInputName.at(inputLevel) << "\"\n"; + errOut << "SOLUTION: define parameter \""<< parIn << "\" in \"" << parameterInputName.at(parArray[iPar]->inputLevelAllowed) <<"\"\n" <logMain, EXIT_CODE_PARAMETER, *this); + } else if (parArray[iPar]->inputLevel==inputLevel) {//this parameter was already defined at this input level + ostringstream errOut; + errOut << "EXITING: FATAL INPUT ERROR: duplicate parameter \""<< parIn << "\" in input \"" << parameterInputName.at(inputLevel) << "\"\n"; + errOut << "SOLUTION: keep only one definition of input parameters in each input source\n"<logMain, EXIT_CODE_PARAMETER, *this); + } else {//read values + parArray[iPar]->inputValues(lineInStream); 
+ parArray[iPar]->inputLevel=inputLevel; + if ( inOut->logMain.good() ) { + inOut->logMain << setiosflags(ios::left) << setw(PAR_NAME_PRINT_WIDTH) << parArray[iPar]->nameString << *(parArray[iPar]); + if ( parArray[iPar]->inputLevel > 0 ) inOut->logMain <<" ~RE-DEFINED"; + inOut->logMain << endl; + }; + }; + }; + return 0; +}; + + + diff --git a/star-sys/STAR/source/Parameters.h b/star-sys/STAR/source/Parameters.h new file mode 100755 index 0000000..671b389 --- /dev/null +++ b/star-sys/STAR/source/Parameters.h @@ -0,0 +1,341 @@ +#ifndef PARAMETERS_DEF +#define PARAMETERS_DEF + +#include "IncludeDefine.h" +#include "InOutStreams.h" +#include "ParameterInfo.h" +#include +#include "TimeFunctions.h" +#include +#include +#include "ParametersChimeric.h" +#include "ParametersSolo.h" +#include "ParametersGenome.h" +#include +#include + +class Parameters { + + public: + vector parArray, parArrayInitial; + vector parameterInputName; + + string commandLine, commandLineFull; + + //version + string versionGenome; + + //system parameters + string sysShell; //shell for executing system commands + + // run parameters + string runMode; + int runThreadN; + mode_t runDirPerm; + string runDirPermIn; //permission for directores created at run-time + int runRNGseed; //random number generator seed + + //parameters + vector parametersFiles; + + //input + string inputBAMfile; + + //genome + char genomeNumToNT[6]; + ParametersGenome pGe; + + //binning,windows,anchors + uint winBinChrNbits, winBinNbits, winAnchorDistNbins, winFlankNbins, winBinN; + uint winAnchorMultimapNmax; //max number of alignments for anchors + double winReadCoverageRelativeMin; + uint winReadCoverageBasesMin; + + //read parameters + vector readFilesType; + int readFilesTypeN; + string readFilesPrefix; + vector readFilesIn, readFilesInTmp; + uint32 readFilesN; + vector > readFilesNames; + vector readFilesCommand; + int readFilesIndex; + pid_t readFilesCommandPID[MAX_N_MATES]; + + uint readMapNumber; + uint iReadAll; + 
uint readNmates, readNmatesIn; + string readMatesLengthsIn; + + vector readNameSeparator; + vector readNameSeparatorChar; + + string outSAMreadID; + bool outSAMreadIDnumber; + + vector clip5pNbases, clip3pNbases, clip3pAfterAdapterNbases; + vector clip3pAdapterMMp; + vector clip3pAdapterSeq; + char *clip3pAdapterSeqNum[MAX_N_MATES];//adapter sequence - numerical + bool readMatesEqualLengths; //whether or not the read mates have the same length, true if onyl one mate + + //align parameters + uint alignSJoverhangMin,alignSJDBoverhangMin,alignSplicedMateMapLmin; //min SJ donor/acceptor length + double alignSplicedMateMapLminOverLmate; + uint alignWindowsPerReadNmax; //max number of alignment windows per read + uint alignTranscriptsPerWindowNmax; //maximum number of transcripts recorded per window + uint alignTranscriptsPerReadNmax; //max number of alignments per read + uint alignIntronMin;//min length to call a gap an intron + uint alignIntronMax;//max length to call + uint alignMatesGapMax;//max gap between the mates (if paired-end) + vector alignSJstitchMismatchNmax; + + // struct { + // string strandString; + // int32 strand; + // } pReads; + + struct { + string in; + bool yes; + } alignSoftClipAtReferenceEnds; + + struct { + string in; + bool ext[2][2]; + } alignEndsType; + + struct { + vector in; + int nBasesMax; + bool concordantPair; + } alignEndsProtrude; + + struct { + string in; + bool flushRight; + } alignInsertionFlush; + + + //seed parameters + uint seedMultimapNmax; //max number of multiple alignments per piece + uint seedSearchLmax; //max length of the seed + uint seedPerReadNmax; //max number of pieces per Read + uint seedPerWindowNmax; //max number of aligns per window + uint seedNoneLociPerWindow; //max number of aligns from one piece per window + uint seedSearchStartLmax; + double seedSearchStartLmaxOverLread; //length of split start points + uint seedSplitMin; + + //chunk parameters + uint 
chunkInSizeBytes,chunkInSizeBytesArray,chunkOutBAMsizeBytes; + + //output + string outFileNamePrefix, outStd; + string outTmpDir, outTmpKeep; + + //SAM output + string outBAMfileCoordName, outBAMfileUnsortedName, outQuantBAMfileName; + string samHeader, samHeaderHD, samHeaderSortedCoord, samHeaderExtra; + string outSAMmode, outSAMorder, outSAMprimaryFlag; + vector outSAMattributes, outSAMheaderHD, outSAMheaderPG; + vector outSAMattrRGline,outSAMattrRGlineSplit,outSAMattrRG; + uint outSAMmultNmax,outSAMattrIHstart; + string outSAMheaderCommentFile; + int outSAMmapqUnique; + + struct { + string in; + uint32 type; + } outSAMstrandField; + + int outSAMtlen; + + struct {bool NH,HI,AS,NM,MD,nM,jM,jI,RG,XS,rB,vG,vA,vW,ch,MC,CR,CY,UR,UY;} outSAMattrPresent, outSAMattrPresentQuant; + + vector outSAMattrOrder, outSAMattrOrderQuant; + int outBAMcompression; + vector outSAMtype; + bool outBAMunsorted, outBAMcoord, outSAMbool; + uint32 outBAMcoordNbins; + uint32 outBAMsortingBinsN;//user-defined number of bins for sorting + string outBAMsortTmpDir; + +// string bamRemoveDuplicatesType; +// uint bamRemoveDuplicatesMate2basesN; + struct { + string mode; + bool yes; + bool markMulti; + uint mate2basesN; + } removeDuplicates; + + int outBAMsortingThreadN, outBAMsortingThreadNactual; + uint64 *outBAMsortingBinStart; //genomic starts for bins for sorting BAM files + uint16 outSAMflagOR, outSAMflagAND; + + struct { + vector mode; + bool yes; + bool within;//output unmapped reads within SAM/BAM files + bool keepPairs;//keep mates together + } outSAMunmapped; + + struct { + vector mode; + bool yes; + bool KeepOnlyAddedReferences; + bool KeepAllAddedReferences; + } outSAMfilter; + + struct { + string mode; + bool random; + } outMultimapperOrder; + + struct { + bool yes; + uint NbasesMin; + double MMp; + } peOverlap; + + string outReadsUnmapped; + int outQSconversionAdd; + string outFileTmp; + + //output filtering + uint outFilterMismatchNmax; + double outFilterMismatchNoverLmax, 
outFilterMismatchNoverReadLmax; //max proportion of all MM within all bases + + uint outFilterMatchNmin,outFilterMultimapNmax;//min number of matches + double outFilterScoreMinOverLread, outFilterMatchNminOverLread;//normalzied to read length + intScore outFilterScoreMin,outFilterMultimapScoreRange;//min score to output + string outFilterIntronMotifs,outFilterIntronStrands; + string outFilterType; //type of filtering + int outFilterBySJoutStage; //indicates the stage of filtering by SJout + + //output filtering SJs + string outSJfilterReads; + vector outSJfilterCountUniqueMin, outSJfilterCountTotalMin; + vector outSJfilterOverhangMin; + vector outSJfilterDistToOtherSJmin; //min allowed distance to other SJ's donor/acceptor + vector outSJfilterIntronMaxVsReadN; + + //wiggle output + vector outWigType, outWigStrand, outWigNorm; + string outWigReferencesPrefix; + struct { + bool yes; + bool strand; + int type; + int format; + int norm; + } outWigFlags; + + //2-pass +// uint twoPass.pass1readsN, twoPass.sjLimit; +// string twoPass.dir,twopassSJpass1file; + struct { + bool yes; //true in 2-pass mode + bool pass2; //true if now running the 2nd pass + uint pass1readsN; + int pass1readsN_par; + string dir; + string pass1sjFile; + string mode; + } twoPass; + + //inserting junctions on the fly + struct { + bool yes; //insert? + bool pass1;//insert on the 1st pass? + bool pass2;//insert on the 2nd pass? 
+ string outDir; + } sjdbInsert; + + //storage limits + uint limitGenomeGenerateRAM; + uint limitIObufferSize; //max size of the in/out buffer, bytes + uint limitOutSAMoneReadBytes; + uint limitOutSJoneRead, limitOutSJcollapsed; + uint limitBAMsortRAM; + uint limitSjdbInsertNsj; + uint limitNreadsSoft; + + // penalties + intScore scoreGap, scoreGapNoncan, scoreGapGCAG, scoreGapATAC, scoreDelBase, scoreDelOpen, scoreInsBase, scoreInsOpen; + intScore scoreStitchSJshift;//Max negative score when + double scoreGenomicLengthLog2scale; + + //quantification parameters + //input + + struct { + bool yes; //if any quantification is done + vector mode; //quantification mode input string + + struct { + bool yes; + bool bamYes; + bool indel; + bool softClip; + bool singleEnd; + int bamCompression; + string ban; + } trSAM; + + struct { + bool yes; + string outFile; + } geCount; + + struct { + bool yes; + } geneFull; + + } quant; + + //variation parameters + struct { + bool yes; + string vcfFile; + } var; + + struct { + bool yes; + bool SAMtag; + string outputMode; + } wasp; + + //solo + ParametersSolo pSolo; + + //chimeric + ParametersChimeric pCh; + + //splitting + char Qsplit; + uint maxNsplit, minLsplit, minLmap; + + //not really parameters, but global variables: + array,2> sjAll; + uint64 sjNovelN, *sjNovelStart, *sjNovelEnd; //novel junctions collapased and filtered + + ////////////////////// CLEAN-UP needed + InOutStreams *inOut; //main input output streams + + uint Lread; + + Parameters(); + ~Parameters(); + int readParsFromFile(ifstream*, ofstream*, int); //read parameters from one file + int readPars(); // read parameters from all files + int scanOneLine (string &lineIn, int inputLevel, int inputLevelRequested); + void scanAllLines (istream &streamIn, int inputLevel, int inputLevelRequested); + void inputParameters (int argInN, char* argIn[]); //input parameters: default, from files, from command line + void openReadsFiles(); + void closeReadsFiles(); + void 
readSAMheader(const string readFilesCommandString, const vector readFilesNames); + +}; +#endif // Parameters.h diff --git a/star-sys/STAR/source/ParametersChimeric.h b/star-sys/STAR/source/ParametersChimeric.h new file mode 100644 index 0000000..2f4602b --- /dev/null +++ b/star-sys/STAR/source/ParametersChimeric.h @@ -0,0 +1,41 @@ +#ifndef CODE_ParametersChimeric +#define CODE_ParametersChimeric + +#include "IncludeDefine.h" + +class Parameters; + +class ParametersChimeric +{// + public: + uint segmentMin, junctionOverhangMin; //min chimeric donor/acceptor length + uint segmentReadGapMax; //max read gap for stitching chimeric windows + int scoreMin,scoreDropMax,scoreSeparation, scoreJunctionNonGTAG; //min chimeric score + uint mainSegmentMultNmax; + + uint multimapScoreRange, multimapNmax, nonchimScoreDropMin; + + vector outJunctionFormat; + + struct + { + vector stringIn; + bool genomicN; + } filter; + + struct + { + vector type; + bool bam; + bool bamHardClip; + bool samOld; + bool junctions; + } out; + + void initialize(Parameters *pPin); + + private: + Parameters *pP; +}; + +#endif \ No newline at end of file diff --git a/star-sys/STAR/source/ParametersChimeric_initialize.cpp b/star-sys/STAR/source/ParametersChimeric_initialize.cpp new file mode 100644 index 0000000..5838a58 --- /dev/null +++ b/star-sys/STAR/source/ParametersChimeric_initialize.cpp @@ -0,0 +1,114 @@ +#include "ParametersChimeric.h" +#include "Parameters.h" +#include "ErrorWarning.h" +#include "GlobalVariables.h" + +void ParametersChimeric::initialize(Parameters *pPin) +{ + if (segmentMin==0) + return; + + pP=pPin; + + if (out.samOld) { + pP->inOut->outChimSAM.open((pP->outFileNamePrefix + "Chimeric.out.sam").c_str()); + pP->inOut->outChimSAM << pP->samHeader; + }; + + out.bam=false; + out.junctions=false; + out.samOld=false; + out.bamHardClip=true;//default + for (const auto& type1 : out.type) { + if (type1=="WithinBAM") { + out.bam=true; + } else if (type1=="SeparateSAMold") { + 
out.samOld=true; + } else if (type1=="Junctions") { + out.junctions=true; + } else if (type1=="HardClip") { + out.bamHardClip=true; + } else if (type1=="SoftClip") { + out.bamHardClip=false; + } else { + ostringstream errOut; + errOut <<"EXITING because of FATAL INPUT ERROR: unknown/unimplemented value for --chimOutType: "<inOut->logMain, EXIT_CODE_PARAMETER, *pP); + }; + }; + + if (out.junctions) { + pP->inOut->outChimJunction.open((pP->outFileNamePrefix + "Chimeric.out.junction").c_str()); + + if (multimapNmax>0) + // column headers for Chimeric.out.junction file + pP->inOut->outChimJunction << + "chr_donorA" <<"\t"<< + "brkpt_donorA" <<"\t"<< + "strand_donorA" <<"\t"<< + "chr_acceptorB" <<"\t"<< + "brkpt_acceptorB" <<"\t"<< + "strand_acceptorB" <<"\t"<< + "junction_type" <<"\t"<< + "repeat_left_lenA" <<"\t"<< + "repeat_right_lenB" <<"\t"<< + "read_name" <<"\t"<< + "start_alnA" <<"\t"<< + "cigar_alnA" <<"\t"<< + "start_alnB" <<"\t"<< + "cigar_alnB" <<"\t"<< + "num_chim_aln" <<"\t"<< + "max_poss_aln_score" <<"\t"<< + "non_chim_aln_score" <<"\t"<< + "this_chim_aln_score" <<"\t"<< + "bestall_chim_aln_score" <<"\t"<< + "PEmerged_bool" <<"\t"<< + "readgrp" <<"\n"; + }; + + + + if (out.bam && !pP->outBAMunsorted && !pP->outBAMcoord) { + ostringstream errOut; + errOut <<"EXITING because of fatal PARAMETERS error: --chimOutType WithinBAM requires BAM output\n"; + errOut <<"SOLUTION: re-run with --outSAMtype BAM Unsorted/SortedByCoordinate\n"; + exitWithError(errOut.str(), std::cerr, pP->inOut->logMain, EXIT_CODE_PARAMETER, *pP); + }; + + if (multimapNmax>0 && (out.bam || out.samOld)) { + ostringstream errOut; + errOut <<"EXITING because of fatal PARAMETERS error: --chimMultimapNmax > 0 (new chimeric detection) presently only works with --chimOutType Junctions\n"; + errOut <<"SOLUTION: re-run with --chimOutType Junctions\n"; + exitWithError(errOut.str(), std::cerr, pP->inOut->logMain, EXIT_CODE_PARAMETER, *pP); + }; + + if (pP->peOverlap.NbasesMin > 0) { + if 
(multimapNmax == 0 && (out.junctions || out.samOld)) { + ostringstream errOut; + errOut <<"EXITING because of fatal PARAMETERS error: --chimMultimapNmax 0 (default old chimeric detection) and --peOverlapNbasesMin > 0 (merging ovelrapping mates) presently only works with --chimOutType WithinBAM\n"; + errOut <<"SOLUTION: re-run with --chimOutType WithinBAM\n"; + exitWithError(errOut.str(), std::cerr, pP->inOut->logMain, EXIT_CODE_PARAMETER, *pP); + }; + }; + + if (out.bam && !pP->outSAMattrPresent.NM) { + pP->outSAMattrOrder.push_back(ATTR_NM); + pP->inOut->logMain << "WARNING --chimOutType=WithinBAM, therefore STAR will output NM attribute" <inOut->logMain, EXIT_CODE_PARAMETER, *pP); + }; + }; +}; diff --git a/star-sys/STAR/source/ParametersGenome.h b/star-sys/STAR/source/ParametersGenome.h new file mode 100644 index 0000000..905e7e1 --- /dev/null +++ b/star-sys/STAR/source/ParametersGenome.h @@ -0,0 +1,34 @@ +#ifndef CODE_ParametersGenome +#define CODE_ParametersGenome + +class ParametersGenome {//"constant" genome parameters - user input + public: + string gDir; + string gLoad; + vector gFastaFiles; + vector gChainFiles; + string gConsensusFile; + + uint gSAindexNbases;//length of the SA pre-index strings + uint gChrBinNbits; + uint gSAsparseD;//SA sparsity + uint gSuffixLengthMax;//maximum length of the suffixes, has to be longer than read length + vector gFileSizes;//size of the genome files + + vector sjdbFileChrStartEnd; + string sjdbGTFfile; + string sjdbGTFchrPrefix; + + string sjdbGTFfeatureExon; + string sjdbGTFtagExonParentTranscript; + string sjdbGTFtagExonParentGene; + vector sjdbGTFtagExonParentGeneName; + vector sjdbGTFtagExonParentGeneType; + + string sjdbInsertSave; + uint sjdbOverhang; + int sjdbOverhang_par; + int sjdbScore; +}; + +#endif \ No newline at end of file diff --git a/star-sys/STAR/source/ParametersSolo.cpp b/star-sys/STAR/source/ParametersSolo.cpp new file mode 100755 index 0000000..c73842f --- /dev/null +++ 
b/star-sys/STAR/source/ParametersSolo.cpp @@ -0,0 +1,163 @@ +#include "ParametersSolo.h" +#include "Parameters.h" +#include "ErrorWarning.h" +#include "streamFuns.h" +#include "SequenceFuns.h" +#include "serviceFuns.cpp" + +#include + +const vector ParametersSolo::featureNames={"Gene","SJ","GeneFull"}; + +void ParametersSolo::initialize(Parameters *pPin) +{ + pP=pPin; + + if (typeStr=="None") { + type=0; + return; + } else if (typeStr=="Droplet") { + if (umiL > 16) { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: UMI length is too long: --soloUMIlen="< 31) { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: CB length is too long: --soloCBlen="<readNmates=1; //output mates TODO: check that readNmatesIn==2 + } else { + ostringstream errOut; + errOut << "EXITING because of fatal PARAMETERS error: unrecognized option in --soloType="<outFileNamePrefix+outFileNames[0].substr(0,outFileNames[0].find_last_of("/")); + if (mkdir(dir1.c_str(),pP->runDirPerm)!=0 && errno!=EEXIST) { + ostringstream errOut; + errOut << "EXITING because of fatal OUTPUT FILE error: could not create Solo output directory"<> seq1) { + if (seq1.size() != cbL) { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR in input CB whitelist file: "<< soloCBwhitelist <<" the total length of barcode sequence is " << seq1.size() << " not equal to expected " <inOut->logMain << "WARNING: CB whitelist sequence contains non-ACGT and is ignored: " << seq1 <); + + if (!pP->quant.trSAM.yes) { + printf("warning: trying to set quant.trSam variables but Parameters is const\n"); + //pP->quant.yes = true; + //pP->quant.trSAM.yes = true; + //pP->quant.trSAM.bamYes = false; + //pP->quant.trSAM.bamCompression = -2; + //pP->quant.trSAM.indel = true; + //pP->quant.trSAM.softClip = true; + //pP->inOut->logMain << "Turning on Genomic->Transcriptomic coordinate conversion for STARsolo\n"; + }; + + if (featureYes[2]) + printf("warning: trying to set 
quant.geneFull.yes but Parameters is const\n"); + // pP->quant.geneFull.yes=true; + + time_t rawTime; + time(&rawTime); + //pP->inOut->logMain << timeMonthDayTime(rawTime) << "Finished reading CB whitelist sequences: " << cbWL.size() < cbWL; + bool cbWLyes; + string strandStr; + int32 strand; + //features + const static vector featureNames; + vector featureIn; + vector features, featureInd; + uint32 nFeatures; + bool featureYes[3]; //which features are requested + //filtering + char QSbase,QSmax;//quality score base and cutoff + float cbMinP;//for CBs with non-exact matching to WL, min posterior probability + //algorithms + vector umiDedup; + vector umiDedupColumns; + vector umiDedupYes; + //output + vector outFileNames; + //constants + uint32 umiMaskLow, umiMaskHigh; //low/high half bit-mask or UMIs + + void initialize(Parameters *pPin); +private: + const Parameters *pP; +}; +#endif diff --git a/star-sys/STAR/source/Parameters_closeReadsFiles.cpp b/star-sys/STAR/source/Parameters_closeReadsFiles.cpp new file mode 100644 index 0000000..f3a22ab --- /dev/null +++ b/star-sys/STAR/source/Parameters_closeReadsFiles.cpp @@ -0,0 +1,12 @@ +#include "Parameters.h" +#include "ErrorWarning.h" +#include +#include +void Parameters::closeReadsFiles() { + for (uint imate=0; imatereadIn[imate].is_open() ) + inOut->readIn[imate].close(); + if (readFilesCommandPID[imate]>0) + kill(readFilesCommandPID[imate],SIGKILL); + }; +}; \ No newline at end of file diff --git a/star-sys/STAR/source/Parameters_openReadsFiles.cpp b/star-sys/STAR/source/Parameters_openReadsFiles.cpp new file mode 100644 index 0000000..d26ca94 --- /dev/null +++ b/star-sys/STAR/source/Parameters_openReadsFiles.cpp @@ -0,0 +1,125 @@ +#include "Parameters.h" +#include "ErrorWarning.h" +#include +#include +void Parameters::openReadsFiles() { + string readFilesCommandString(""); + if (readFilesCommand.at(0)=="-") { + if (readFilesIn.at(0).find(',')readIn[ii].is_open() ) inOut->readIn[ii].close(); + + string 
rfName=(readFilesPrefix=="-" ? "" : readFilesPrefix)+readFilesIn.at(ii); + + inOut->readIn[ii].open(rfName.c_str()); //try to open the Sequences file right away, exit if failed + if (inOut->readIn[ii].fail()) { + ostringstream errOut; + errOut <<"EXITING because of fatal input ERROR: could not open readFilesIn=" << rfName <<"\n"; + exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this); + }; + }; + } else {//create fifo files, execute pre-processing command + + vector readsCommandFileName; + + for (uint imate=0;imatelogMain << "\n Input read files for mate "<< imate+1 <<", from input string " << readFilesIn.at(imate) < \""< "+ outFileTmp+"/readFilesIn.info 2>&1").c_str()); + ifstream readFilesIn_info((outFileTmp+"/readFilesIn.info").c_str()); + inOut->logMain <logMain <<"\n readsCommandsFile:\n"<logMain, EXIT_CODE_PARAMETER, *this); + break; + + case 0: + //this is the child + execlp(readsCommandFileName.at(imate).c_str(), readsCommandFileName.at(imate).c_str(), (char*) NULL); + exit(0); + + default: + //this is the father, record PID of the children + readFilesCommandPID[imate]=PID; + }; + +// system((("\""+readsCommandFileName.at(imate)+"\"") + " & ").c_str()); + inOut->readIn[imate].open(readFilesInTmp.at(imate).c_str()); + }; + if (readFilesIn.size()==2 && readFilesNames.at(0).size() != readFilesNames.at(1).size()) { + ostringstream errOut; + errOut <<"EXITING: because of fatal INPUT ERROR: number of input files for mate1: "<logMain, EXIT_CODE_PARAMETER, *this); + }; + + if (outSAMattrRG.size()>1 && outSAMattrRG.size()!=readFilesN) { + ostringstream errOut; + errOut <<"EXITING: because of fatal INPUT ERROR: number of input read files: "<logMain, EXIT_CODE_PARAMETER, *this); + } else if (outSAMattrRG.size()==1) {//use the same read group for all files + for (uint32 ifile=1;ifile +#include + +void Parameters::readSAMheader(const string readFilesCommandString, const vector readFilesNames) { + + if (readFilesCommandString=="") {//simply 
read from file + while (inOut->readIn[0].peek()=='@') { + string str1; + getline(inOut->readIn[0],str1); + if (str1.substr(1,2)!="HD" && str1.substr(1,2)!="SQ") { + samHeaderExtra += str1 + '\n'; + }; + }; + return; + }; + + string tmpFifo=outFileTmp+"tmp.fifo.header"; + remove(tmpFifo.c_str()); + mkfifo(tmpFifo.c_str(), S_IRUSR | S_IWUSR ); + + ifstream tmpFifoIn; + for (uint32 ii=0; ii " + tmpFifo + "&"; + system(com1.c_str()); + tmpFifoIn.open(tmpFifo); + while (tmpFifoIn.peek()=='@') { + string str1; + getline(tmpFifoIn,str1); + if (str1.substr(1,2)!="HD" && str1.substr(1,2)!="SQ" && (!twoPass.pass2) ) { + //SQ and HD header lines cannot be imported from uSAM; do not record the header again in the 2nd pass + samHeaderExtra += str1 + '\n'; + }; + }; + tmpFifoIn.close(); + }; +}; \ No newline at end of file diff --git a/star-sys/STAR/source/Quantifications.cpp b/star-sys/STAR/source/Quantifications.cpp new file mode 100644 index 0000000..efc6ee0 --- /dev/null +++ b/star-sys/STAR/source/Quantifications.cpp @@ -0,0 +1,37 @@ +#include "Quantifications.h" + +Quantifications::Quantifications (uint32 nGeIn) { + + geneCounts.nType=3; + geneCounts.cAmbig = new uintQ[geneCounts.nType]; + geneCounts.cNone = new uintQ[geneCounts.nType]; + + geneCounts.nGe=nGeIn; + geneCounts.gCount = new uintQ* [geneCounts.nType]; + + geneCounts.cMulti=0; + for (int itype=0; itype (0.0, 1.0); + //transcriptome + if ( P.quant.trSAM.yes ) { + alignTrAll=new Transcript [P.alignTranscriptsPerReadNmax]; + }; + //split + splitR=new uint*[3]; + splitR[0]=new uint[P.maxNsplit]; splitR[1]=new uint[P.maxNsplit]; splitR[2]=new uint[P.maxNsplit]; + //alignments + PC=new uiPC[P.seedPerReadNmax]; + WC=new uiWC[P.alignWindowsPerReadNmax]; + nWA=new uint[P.alignWindowsPerReadNmax]; + nWAP=new uint[P.alignWindowsPerReadNmax]; + WALrec=new uint[P.alignWindowsPerReadNmax]; + WlastAnchor=new uint[P.alignWindowsPerReadNmax]; + +#ifdef COMPILE_FOR_LONG_READS + swWinCov = new uint[P.alignWindowsPerReadNmax]; + 
scoreSeedToSeed = new intScore [P.seedPerWindowNmax*(P.seedPerWindowNmax+1)/2]; + scoreSeedBest = new intScore [P.seedPerWindowNmax]; + scoreSeedBestInd = new uint [P.seedPerWindowNmax]; + scoreSeedBestMM = new uint [P.seedPerWindowNmax]; + seedChain = new uint [P.seedPerWindowNmax]; +#endif + + WA=new uiWA*[P.alignWindowsPerReadNmax]; + for (uint ii=0;ii +#include + +class ReadAlign { + public: + //methods + ReadAlign (const Parameters& Pin, const Genome &genomeIn, Transcriptome *TrIn, int iChunk);//allocate arrays + int oneRead(); + + //vars + const Genome &mapGen; //mapped-to-genome structure + + uint iRead; + + // input data as fastq strings + const char *readFastq[2]; + + istream* readInStream[MAX_N_MATES];fstream chunkOutChimSAM, *chunkOutChimJunction, chunkOutUnmappedReadsStream[MAX_N_MATES], chunkOutFilterBySJoutFiles[MAX_N_MATES]; + + + ostream* outSAMstream; + uint outBAMbytes; //number of bytes output to SAM/BAM with oneRead + char *outBAMarray;//pointer to the (last+1) position of the SAM/BAM output array + + uint outFilterMismatchNmaxTotal; + uint Lread, readLength[MAX_N_MATES], readLengthOriginal[MAX_N_MATES], readLengthPair, readLengthPairOriginal; + intScore maxScoreMate[MAX_N_MATES]; + + uint readFilesIndex; + + ReadAlign *waspRA; //ReadAlign for alternative WASP alignment + int waspType, waspType1; //alignment ASE-WASP type and + + ReadAlign *peMergeRA; //ReadAlign for merged PE mates + + uint readNmates; + char **Read0; + char **Qual0; + char **Read1; + char **Qual1; + char **readNameMates; + char *readName; + void multMapSelect(); + int mapOneRead(); + const char* outputAlignments(); + void resetN(); + private: + const Parameters& P; //pointer to the parameters, will be initialized on construction + + //quantification + Transcriptome *chunkTr; + + //mapping time + time_t timeStart, timeFinish; + + //random number generators + std::mt19937 rngMultOrder;//initialize in ReadAlign.cpp + std::uniform_real_distribution rngUniformReal0to1;//initialize 
in ReadAlign.cpp + + //input,output + + char** outBAMoneAlign; + uint* outBAMoneAlignNbytes; + + ostringstream samStreamCIGAR, samStreamSJmotif, samStreamSJintron; + vector matesCIGAR; + + intScore *scoreSeedToSeed, *scoreSeedBest; + uint *scoreSeedBestInd, *seedChain, *scoreSeedBestMM; + + bool outFilterPassed; //true if alignment passed all filter and is output to SAM/BAM + +// StatsAll *statsRA; + + //transcript + Transcript* trArray; //linear array of transcripts to store all of them from all windows + Transcript** trArrayPointer; //linear array of transcripts to store all of them from all windows + + //read + uint iReadAll, iMate; + char readFilter; //Illumina not passed Y/N + bool revertStrand; //what to do with the strand, according to strandType and iMate + uint clip3pNtotal[MAX_N_MATES], clip5pNtotal[MAX_N_MATES], clip3pAdapterN[MAX_N_MATES]; //total number of trimmed bases from 5p,3p + int readFileType; //file type: 1=fasta; 2=fastq + + vectorreadNameExtra; + + char dummyChar[4096]; + //char** Read0; + //char** Qual0; + //char** readNameMates; + //char* readName; + //char** Qual1; //modified QSs for scoring + + //uint readNmates; + //split + uint** splitR; + uint Nsplit; + +// uint fragLength[MAX_N_FRAG], fragStart[MAX_N_FRAG]; //fragment Lengths and Starts in read space + + //binned alignments + uintWinBin **winBin; //binned genome: window ID (number) per bin + + //alignments + uiPC *PC; //pieces coordinates + uiWC *WC; //windows coordinates + uiWA **WA; //aligments per window + + int unmapType; //marker for why a read is unmapped + + uint mapMarker; //alignment marker (typically, if there is something wrong) + uint nA, nP, nW, nWall, nUM[2]; //number of all alignments, pieces, windows, U/M, + uint *nWA, *nWAP, *WALrec, *WlastAnchor; //number of alignments per window, per window per piece, min recordable length per window + bool *WAincl; //alginment inclusion mask + + uint *swWinCov, *swWinGleft, *swWinGright, *swWinRleft, *swWinRright; //read coverage 
per window + char *swT; + + uint storedLmin, uniqLmax, uniqLmaxInd, multLmax, multLmaxN, multNmin, multNminL, multNmax, multNmaxL; + uint nTr, nTrMate; // number of transcripts called + intScore maxScore;//maximum alignment score + + Transcript trA, trA1, *trBest, *trInit; //transcript, best tr, next best tr, initialized tr + Transcript ***trAll; //all transcripts for all windows + uint *nWinTr; //number of recorded transcripts per window + + //old chimeric detection + uint chimN, chimRepeat, chimStr; + int chimMotif; + uint chimRepeat0, chimRepeat1, chimJ0, chimJ1; + Transcript trChim[MAX_N_CHIMERAS]; + //new chimeric detection + bool chimRecord; //true if chimeric aligment was detected + + Transcript *alignC, *extendC, *polyAtailC; //alignment rules/conditions + + Transcript* trMult[MAX_N_MULTMAP];//multimapping transcripts + Transcript *alignTrAll;//alignments to transcriptome + + struct { + bool yes; + uint nOv;//number of overlapping bases + uint ovS;//first read base of the overlap + uint mateStart[2];//mates starts in the merged read + } peOv;//PE mates overlap/merge/remap structure + + //void resetN();//resets the counters to 0 + //void multMapSelect(); + //int mapOneRead(); + uint maxMappableLength2strands(uint pieceStart, uint pieceLength, uint iDir, uint iSA1, uint iSA2, uint& maxL, uint iFrag); + void storeAligns (uint iDir, uint Shift, uint Nrep, uint L, uint indStartEnd[2], uint iFrag); + + bool outputTranscript(Transcript *trOut, uint nTrOut, ofstream *outBED); + uint outputTranscriptSAM(Transcript const &trOut, uint nTrOut, uint iTrOut, uint mateChr, uint mateStart, char mateStrand, int unmapType, bool *mateMapped, ostream *outStream); + int alignBAM(Transcript const &trOut, uint nTrOut, uint iTrOut, uint trChrStart, uint mateChr, uint mateStart, char mateStrand, int unmapType, bool *mateMapped, vector outSAMattrOrder, char** outBAMarray, uint* outBAMarrayN); + void samAttrNM_MD (Transcript const &trOut, uint iEx1, uint iEx2, uint &tagNM, string 
&tagMD); + + string outputTranscriptCIGARp(Transcript const &trOut); + int createExtendWindowsWithAlign(uint a1, uint aStr); //extends and windows with one alignment + void assignAlignToWindow(uint a1, uint aLength, uint aStr, uint aNrep, uint aFrag, uint aRstart,bool aAnchor, uint sjA); //assigns one alignment to a window + + void mappedFilter(); + void chimericDetection(); + bool chimericDetectionOld(); + void chimericDetectionOldOutput(); + bool chimericDetectionMult(); + void chimericDetectionPEmerged(ReadAlign &seRa); +// void chimericDetectionPEmergedTrim(); + + //string outputAlignments(); + void calcCIGAR(Transcript const &trOut, uint nMates, uint iExMate, uint leftMate); + + void stitchWindowSeeds (uint iW, uint iWrec, bool *WAexcl, char *R);//stitches all seeds in one window: iW + void stitchPieces(char **R, uint Lread); + + uint quantTranscriptome (Transcriptome *Tr, uint nAlignG, Transcript **alignG, Transcript *alignT, vector &readTranscripts, set &readTrGenes); + + void copyRead(ReadAlign&); + void peOverlapMergeMap(); + void peMergeMates(); + void peOverlapSEtoPE(ReadAlign &seRA); + + +}; + +#endif + + diff --git a/star-sys/STAR/source/ReadAlignChunk.cpp b/star-sys/STAR/source/ReadAlignChunk.cpp new file mode 100644 index 0000000..30d3e5f --- /dev/null +++ b/star-sys/STAR/source/ReadAlignChunk.cpp @@ -0,0 +1,110 @@ +#include +#include + +#include "ReadAlignChunk.h" +#include +#include "ErrorWarning.h" + +using std::strstreambuf; +using std::istream; + +ReadAlignChunk::ReadAlignChunk(Parameters& Pin, Genome &genomeIn, Transcriptome *TrIn, int iChunk) : P(Pin), mapGen(genomeIn) {//initialize chunk + + iThread=iChunk; + + if ( P.quant.yes ) {//allocate transcriptome structures + chunkTr=new Transcriptome(*TrIn); + chunkTr->quantsAllocate(); + } else { + chunkTr=NULL; + }; + + RA = new ReadAlign(P, mapGen, chunkTr, iChunk);//new local copy of RA for each chunk + + RA->iRead=0; + + chunkIn=new char* [P.readNmates]; + readInStream=new istream* 
[P.readNmates]; +// readInStream=new istringstream* [P.readNmates]; + for (uint ii=0;iireadInStream[ii]=readInStream[ii]; + }; + + + if (P.outSAMbool) { + chunkOutBAM=new char [P.chunkOutBAMsizeBytes]; + RA->outBAMarray=chunkOutBAM; + strstreambuf *buf = new strstreambuf(chunkOutBAM,P.chunkOutBAMsizeBytes,chunkOutBAM); + chunkOutBAMstream=new ostream(buf); + RA->outSAMstream=chunkOutBAMstream; + chunkOutBAMtotal=0; + }; + + if (P.wasp.yes) { + RA->waspRA= new ReadAlign(Pin,genomeIn,TrIn,iChunk); + }; + if (P.peOverlap.yes) { + RA->peMergeRA= new ReadAlign(Pin,genomeIn,TrIn,iChunk); + }; +}; + +/////////////// +void ReadAlignChunk::chunkFstreamOpen(string filePrefix, int iChunk, fstream &fstreamOut) {//open fstreams for chunks + ostringstream fNameStream1; + fNameStream1 << filePrefix << iChunk; + string fName1=fNameStream1.str(); + P.inOut->logMain << "Opening the file: " << fName1 << " ... " <logMain << "failed!\n"; + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: could not create output file "<< fName1 << "\n"; + errOut << "Solution: check that you have permission to write this file\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + P.inOut->logMain << "ok" <flush(); + allOut->clear(); + fileChunkIn.close(); + fileChunkIn.clear(); + remove(name1.str().c_str()); + iC++; + } else { + fileChunkIn.close(); + break; + }; + }; +}; + diff --git a/star-sys/STAR/source/ReadAlignChunk.h b/star-sys/STAR/source/ReadAlignChunk.h new file mode 100644 index 0000000..c8c80a1 --- /dev/null +++ b/star-sys/STAR/source/ReadAlignChunk.h @@ -0,0 +1,40 @@ +#ifndef CODE_ReadAlignChunk +#define CODE_ReadAlignChunk + +#include "IncludeDefine.h" +#include "Parameters.h" +#include "ReadAlign.h" +#include "Transcriptome.h" + +class ReadAlignChunk {//chunk of reads and alignments +public: + Parameters& P; + ReadAlign* RA; + + Transcriptome *chunkTr; + + char **chunkIn; //space for the chunk of input reads + char *chunkOutBAM, 
*chunkOutBAM1;//space for the chunk of output SAM + + istream** readInStream; + ostream* chunkOutBAMstream; + ofstream chunkOutBAMfile; + string chunkOutBAMfileName; + + bool noReadsLeft; + uint iChunkIn; //current chunk # as read from .fastq + uint iChunkOutSAM; //current chunk # writtedn to Aligned.out.sam + int iThread; //current thread + uint chunkOutBAMtotal; //total number of bytes in the write buffer + + ReadAlignChunk(Parameters& Pin, Genome &genomeIn, Transcriptome *TrIn, int iChunk); + void processChunks(); + void mapChunk(); + void chunkFstreamOpen(string filePrefix, int iChunk, fstream &fstreamOut); + void chunkFstreamCat (fstream &chunkOut, ofstream &allOut, bool mutexFlag, pthread_mutex_t &mutexVal); + void chunkFilesCat(ostream *allOut, string filePrefix, uint &iC); + + Genome &mapGen; +private: +}; +#endif diff --git a/star-sys/STAR/source/ReadAlignChunk_mapChunk.cpp b/star-sys/STAR/source/ReadAlignChunk_mapChunk.cpp new file mode 100644 index 0000000..3e81c3a --- /dev/null +++ b/star-sys/STAR/source/ReadAlignChunk_mapChunk.cpp @@ -0,0 +1,107 @@ +#include "ReadAlignChunk.h" +#include "GlobalVariables.h" +#include "ThreadControl.h" +#include "ErrorWarning.h" +#include SAMTOOLS_BGZF_H + +void ReadAlignChunk::mapChunk() {//map one chunk. Input reads stream has to be setup in RA->readInStream[ii] + RA->statsRA.resetN(); + + for (uint ii=0;iireadInStream[ii]->clear(); + RA->readInStream[ii]->seekg(0,ios::beg); + }; + + if ( P.outSAMorder == "PairedKeepInputOrder" && P.runThreadN>1 ) {//open chunk file + ostringstream name1(""); + name1 << P.outFileTmp + "/Aligned.tmp.sam.chunk"<oneRead(); //map one read + + if (readStatus==0) {//there was a read processed + RA->iRead++; +// chunkOutBAMtotal=(uint) RA->outSAMstream->tellp(); + chunkOutBAMtotal+=RA->outBAMbytes; +// uint ddd=(uint) RA->outSAMstream->tellp(); + }; + + //write SAM aligns to chunk buffer + if (P.outSAMbool) { + if ( chunkOutBAMtotal > P.chunkOutBAMsizeBytes ) {//this should not happen! 
+ ostringstream errOut; + errOut <<"EXITING because of fatal error: buffer size for SAM/BAM output is too small\n"; + errOut <<"Solution: increase input parameter --limitOutSAMoneReadBytes\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + } else if ( chunkOutBAMtotal + P.limitOutSAMoneReadBytes > P.chunkOutBAMsizeBytes || (readStatus==-1 && noReadsLeft) ) {//write buffer to disk because it's almost full, or all reads are mapped + if ( P.outSAMorder == "PairedKeepInputOrder" && P.runThreadN>1 ) {//output chunks into separate files + chunkOutBAMfile.write(chunkOutBAM,chunkOutBAMtotal); + chunkOutBAMfile.clear(); //in case 0 bytes were written which could set fail bit + } else {//standard way, directly into Aligned.out.sam file + //SAM output + if (P.runThreadN>1) pthread_mutex_lock(&g_threadChunks.mutexOutSAM); + P.inOut->outSAM->write(chunkOutBAM,chunkOutBAMtotal); + P.inOut->outSAM->clear();//in case 0 bytes were written which could set fail bit + if (P.runThreadN>1) pthread_mutex_unlock(&g_threadChunks.mutexOutSAM); + }; + RA->outSAMstream->seekp(0,ios::beg); //rewind the chunk storage + chunkOutBAMtotal=0; + }; + }; + + //collapse SJ buffer if needed + if ( chunkOutSJ->N > P.limitOutSJcollapsed ) {//this means the number of collapsed junctions is larger than the chunks size + ostringstream errOut; + errOut <<"EXITING because of fatal error: buffer size for SJ output is too small\n"; + errOut <<"Solution: increase input parameter --limitOutSJoneRead\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + } else if ( chunkOutSJ->N + P.limitOutSJoneRead > P.limitOutSJcollapsed || (readStatus==-1 && noReadsLeft) ) {//write buffer to disk because it's almost full, or all reads are mapped + chunkOutSJ->collapseSJ(); + if ( chunkOutSJ->N + 2*P.limitOutSJoneRead > P.limitOutSJcollapsed ) { + ostringstream errOut; + errOut <<"EXITING because of fatal error: buffer size for SJ output is too 
small\n"; + errOut <<"Solution: increase input parameter --limitOutSJcollapsed\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + }; + + //collapse SJ1 buffer if needed + if ( chunkOutSJ1->N > P.limitOutSJcollapsed ) {//this means the number of collapsed junctions is larger than the chunks size + ostringstream errOut; + errOut <<"EXITING because of fatal error: buffer size for SJ output is too small\n"; + errOut <<"Solution: increase input parameter --limitOutSJoneRead\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + } else if ( chunkOutSJ1->N + P.limitOutSJoneRead > P.limitOutSJcollapsed || (readStatus==-1 && noReadsLeft) ) {//write buffer to disk because it's almost full, or all reads are mapped + chunkOutSJ1->collapseSJ(); + if ( chunkOutSJ1->N + 2*P.limitOutSJoneRead > P.limitOutSJcollapsed ) { + ostringstream errOut; + errOut <<"EXITING because of fatal error: buffer size for SJ output is too small\n"; + errOut <<"Solution: increase input parameter --limitOutSJcollapsed\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + }; + + }; //reads cycle + + if ( P.outSAMbool && P.outSAMorder == "PairedKeepInputOrder" && P.runThreadN>1 ) {//write the remaining part of the buffer, close and rename chunk files + chunkOutBAMfile.write(chunkOutBAM,chunkOutBAMtotal); + chunkOutBAMfile.clear(); //in case 0 bytes were written which could set fail bit + chunkOutBAMfile.close(); + RA->outSAMstream->seekp(0,ios::beg); //rewind the chunk storage + chunkOutBAMtotal=0; + ostringstream name2(""); + name2 << P.outFileTmp + "/Aligned.out.sam.chunk"<1) pthread_mutex_lock(&g_threadChunks.mutexStats); + g_statsAll.addStats(RA->statsRA); + g_statsAll.progressReport(P.inOut->logProgress); + if (P.runThreadN>1) pthread_mutex_unlock(&g_threadChunks.mutexStats); +}; diff --git a/star-sys/STAR/source/ReadAlignChunk_processChunks.cpp 
b/star-sys/STAR/source/ReadAlignChunk_processChunks.cpp new file mode 100755 index 0000000..8bf0fe2 --- /dev/null +++ b/star-sys/STAR/source/ReadAlignChunk_processChunks.cpp @@ -0,0 +1,247 @@ +#include "ReadAlignChunk.h" +#include "GlobalVariables.h" +#include "ThreadControl.h" +#include "ErrorWarning.h" +#include "SequenceFuns.h" + +void ReadAlignChunk::processChunks() {//read-map-write chunks + noReadsLeft=false; //true if there no more reads left in the file + bool newFile=false; //new file marker in the input stream + while (!noReadsLeft) {//continue until the input EOF + //////////////read a chunk from input files and store in memory + if (P.outFilterBySJoutStage<2) {//read chunks from input file + + if (P.runThreadN>1) pthread_mutex_lock(&g_threadChunks.mutexInRead); + + uint chunkInSizeBytesTotal[2]={0,0}; + while (chunkInSizeBytesTotal[0] < P.chunkInSizeBytes && chunkInSizeBytesTotal[1] < P.chunkInSizeBytes && P.inOut->readIn[0].good() && P.inOut->readIn[1].good()) { + char nextChar=P.inOut->readIn[0].peek(); + if (P.iReadAll==P.readMapNumber) {//do not read any more reads + break; + } else if (P.readFilesTypeN==10 && P.inOut->readIn[0].good() && P.outFilterBySJoutStage!=2) {//SAM input && not eof && not 2nd stage + + string str1; + + if (nextChar=='@') {//with SAM input linest that start with @ are headers + getline(P.inOut->readIn[0], str1); //read line and skip it + continue; + }; + + P.inOut->readIn[0] >> str1; + if (str1=="FILE") { + newFile=true; + } else { + P.iReadAll++; //increment read number + + uint imate1=0; + for (uint imate=0;imate0) + P.inOut->readIn[0] >> str1; //for imate=0 str1 was already read + uint flag; + P.inOut->readIn[0] >>flag; //read name and flag + char passFilterIllumina=(flag & 0x800 ? 
'Y' : 'N'); + + if (imate==1) {//2nd line is always opposite of the 1st one + imate1=1-imate1; + } else if (P.readNmates==2 && (flag & 0x80)) { + imate1=1; + } else { + imate1=0; + }; + + //read ID or number + if (P.outSAMreadID=="Number") { + chunkInSizeBytesTotal[imate1] += sprintf(chunkIn[imate1] + chunkInSizeBytesTotal[imate1], "@%llu", P.iReadAll); + } else { + chunkInSizeBytesTotal[imate1] += sprintf(chunkIn[imate1] + chunkInSizeBytesTotal[imate1], "@%s", str1.c_str()); + }; + + //iReadAll, passFilterIllumina, passFilterIllumina + chunkInSizeBytesTotal[imate1] += sprintf(chunkIn[imate1] + chunkInSizeBytesTotal[imate1], " %llu %c %i", P.iReadAll, passFilterIllumina, P.readFilesIndex); + + for (int ii=3; ii<=9; ii++) + P.inOut->readIn[0] >> str1; //skip fields until sequence + + string seq1,qual1; + P.inOut->readIn[0] >> seq1 >> qual1; + if (flag & 0x10) {//sequence reverse-coomplemented + revComplementNucleotides(seq1); + reverse(qual1.begin(),qual1.end()); + }; + + getline(P.inOut->readIn[0],str1); //str1 is now all SAM attributes + chunkInSizeBytesTotal[imate1] += sprintf(chunkIn[imate1] + chunkInSizeBytesTotal[imate1], "%s\n%s\n+\n%s\n", str1.c_str(), seq1.c_str(), qual1.c_str()); + }; + }; + } else if (nextChar=='@') {//fastq, not multi-line + P.iReadAll++; //increment read number + if (P.outFilterBySJoutStage!=2) {//not the 2nd stage of the 2-stage mapping, read ID from the 1st read + string readID; + P.inOut->readIn[0] >> readID; + if (P.outSAMreadIDnumber) { + readID="@"+to_string(P.iReadAll); + }; + //read the second field of the read name line + char passFilterIllumina='N'; + if (P.inOut->readIn[0].peek()!='\n') {//2nd field exists + string field2; + P.inOut->readIn[0] >> field2; + if (field2.length()>=3 && field2.at(1)==':' && field2.at(2)=='Y' && field2.at(3)==':' ) + passFilterIllumina='Y'; + }; + readID += ' '+ to_string(P.iReadAll)+' '+passFilterIllumina+' '+to_string(P.readFilesIndex); + + //ignore the rest of the read name for both mates + for 
(uint imate=0; imatereadIn[imate].ignore(DEF_readNameSeqLengthMax,'\n'); + + if (P.pSolo.type==1) {//record barcode sequence + string seq1; + getline(P.inOut->readIn[1],seq1); + if (seq1.size() != P.pSolo.bL && P.pSolo.bL > 0) { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR in input read file: the total length of barcode sequence is " << seq1.size() << " not equal to expected " <logMain, EXIT_CODE_INPUT_FILES, P); + }; + readID += ' ' + seq1; + P.inOut->readIn[1].ignore(DEF_readNameSeqLengthMax,'\n');//skip to the end of 3rd ("+") line + getline(P.inOut->readIn[1],seq1); //read qualities + readID += ' ' + seq1; + }; + + //copy the same readID to both mates + for (uint imate=0; imatereadIn[imate].getline(chunkIn[imate] + chunkInSizeBytesTotal[imate], DEF_readNameSeqLengthMax+1 ); + chunkInSizeBytesTotal[imate] += P.inOut->readIn[imate].gcount(); + chunkIn[imate][chunkInSizeBytesTotal[imate]-1]='\n'; + }; + }; + } else if (nextChar=='>') {//fasta, can be multiline, which is converted to single line + P.iReadAll++; //increment read number + for (uint imate=0; imate%llu", P.iReadAll); + } else { + P.inOut->readIn[imate] >> (chunkIn[imate] + chunkInSizeBytesTotal[imate]); + chunkInSizeBytesTotal[imate] += strlen(chunkIn[imate] + chunkInSizeBytesTotal[imate]); + }; + + P.inOut->readIn[imate].ignore(DEF_readNameSeqLengthMax,'\n'); + + chunkInSizeBytesTotal[imate] += sprintf(chunkIn[imate] + chunkInSizeBytesTotal[imate], " %llu %c %i \n", P.iReadAll, 'N', P.readFilesIndex); + + + }; +// else {//2nd stage of 2-stage mapping +// read index and file index are already recorded with the read name, simply copy it +// P.inOut->readIn[imate].getline(chunkIn[imate] + chunkInSizeBytesTotal[imate], DEF_readNameSeqLengthMax+1 ); +// }; + nextChar=P.inOut->readIn[imate].peek(); + while (nextChar!='@' && nextChar!='>' && nextChar!=' ' && nextChar!='\n' && P.inOut->readIn[imate].good()) {//read multi-line fasta + P.inOut->readIn[imate].getline(chunkIn[imate] + 
chunkInSizeBytesTotal[imate], DEF_readSeqLengthMax + 1 ); + if (P.inOut->readIn[imate].gcount()<2) break; //no more input + chunkInSizeBytesTotal[imate] += P.inOut->readIn[imate].gcount()-1; + nextChar=P.inOut->readIn[imate].peek(); + }; + chunkIn[imate][chunkInSizeBytesTotal[imate]]='\n'; + chunkInSizeBytesTotal[imate] ++; + }; + } else if (nextChar==' ' || nextChar=='\n' || !P.inOut->readIn[0].good()) {//end of stream + P.inOut->logMain << "Thread #" < \n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + }; + + if (newFile) { + P.inOut->readIn[0] >> P.readFilesIndex; + pthread_mutex_lock(&g_threadChunks.mutexLogMain); + P.inOut->logMain << "Starting to map file # " << P.readFilesIndex<<"\n"; + for (uint imate=0; imatelogMain << "mate " <readIn[imate].ignore(numeric_limits::max(),'\n'); + }; + P.inOut->logMain<1) pthread_mutex_unlock(&g_threadChunks.mutexInRead); + + } else {//read from one file per thread + noReadsLeft=true; + for (uint imate=0; imatechunkOutFilterBySJoutFiles[imate].flush(); + RA->chunkOutFilterBySJoutFiles[imate].seekg(0,ios::beg); + RA->readInStream[imate]=& RA->chunkOutFilterBySJoutFiles[imate]; + }; + }; + + mapChunk(); + + if (iThread==0 && P.runThreadN>1 && P.outSAMorder=="PairedKeepInputOrder") {//concatenate Aligned.* files + chunkFilesCat(P.inOut->outSAM, P.outFileTmp + "/Aligned.out.sam.chunk", g_threadChunks.chunkOutN); + }; + + };//cycle over input chunks + + if (P.outFilterBySJoutStage!=1 && RA->iRead>0) {//not the first stage of the 2-stage mapping + if (P.outBAMunsorted) chunkOutBAMunsorted->unsortedFlush(); + if (P.outBAMcoord) chunkOutBAMcoord->coordFlush(); + if (chunkOutBAMquant!=NULL) chunkOutBAMquant->unsortedFlush(); + + //the thread is finished mapping reads, concatenate the temp files into output files + if (P.pCh.segmentMin>0) { + chunkFstreamCat (RA->chunkOutChimSAM, P.inOut->outChimSAM, P.runThreadN>1, g_threadChunks.mutexOutChimSAM); + chunkFstreamCat 
(*RA->chunkOutChimJunction, P.inOut->outChimJunction, P.runThreadN>1, g_threadChunks.mutexOutChimJunction); + }; + if (P.outReadsUnmapped=="Fastx" ) { + if (P.runThreadN>1) + pthread_mutex_lock(&g_threadChunks.mutexOutUnmappedFastx); + + for (uint ii=0;iichunkOutUnmappedReadsStream[ii],P.inOut->outUnmappedReadsStream[ii], false, g_threadChunks.mutexOutUnmappedFastx); + }; + + if (P.runThreadN>1) + pthread_mutex_unlock(&g_threadChunks.mutexOutUnmappedFastx); + }; + }; + if (P.runThreadN>1) pthread_mutex_lock(&g_threadChunks.mutexLogMain); + P.inOut->logMain << "Completed: thread #" <1) pthread_mutex_unlock(&g_threadChunks.mutexLogMain); +}; + diff --git a/star-sys/STAR/source/ReadAlign_CIGAR.cpp b/star-sys/STAR/source/ReadAlign_CIGAR.cpp new file mode 100644 index 0000000..baf10d8 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_CIGAR.cpp @@ -0,0 +1,62 @@ +#include "ReadAlign.h" + +uint ReadAlign::alignCIGAR +samStreamCIGAR.str(std::string()); + + uint trimL; + if (Str==0 && Mate==0) { + trimL=clip5pNtotal[Mate]; + } else if (Str==0 && Mate==1) { + trimL=clip3pNtotal[Mate]; + } else if (Str==1 && Mate==0) { + trimL=clip3pNtotal[Mate]; + } else { + trimL=clip5pNtotal[Mate]; + }; + + uint trimL1 = trimL + trOut.exons[iEx1][EX_R] - (trOut.exons[iEx1][EX_R]0) { + samStreamCIGAR << trimL1 << "S"; //initial trimming + }; + + for (uint ii=iEx1;ii<=iEx2;ii++) { + if (ii>iEx1) {//record gaps + uint gapG=trOut.exons[ii][EX_G]-(trOut.exons[ii-1][EX_G]+trOut.exons[ii-1][EX_L]); + uint gapR=trOut.exons[ii][EX_R]-trOut.exons[ii-1][EX_R]-trOut.exons[ii-1][EX_L]; + //it's possible to have a D or N and I at the same time + if (gapR>0){ + samStreamCIGAR << gapR; + samStreamCIGAR << "I"; + }; + if (trOut.canonSJ[ii-1]>=0 || trOut.sjAnnot[ii-1]==1) {//junction: N + samStreamCIGAR << gapG; + samStreamCIGAR << "N"; + samStreamSJmotif <<','<< trOut.canonSJ[ii-1] + (trOut.sjAnnot[ii-1]==0 ? 
0 : SJ_SAM_AnnotatedMotifShift); //record junction type +// samStreamSJannot <<','<< (int) trOut.sjAnnot[ii-1]; //record annotation type + samStreamSJintron <<','<< trOut.exons[ii-1][EX_G] + trOut.exons[ii-1][EX_L] + 1 - P->chrStart[trOut.Chr] <<','\ + << trOut.exons[ii][EX_G] - P->chrStart[trOut.Chr]; //record intron loci + } else if (gapG>0) {//deletion: N + samStreamCIGAR << gapG; + samStreamCIGAR << "D"; + }; + }; + samStreamCIGAR << trOut.exons[ii][EX_L] << "M"; + }; + + string SJmotif = samStreamSJmotif.str(); + string SJintron = samStreamSJintron.str(); +// string SJannot = samStreamSJannot.str(); + + if (SJmotif.length()==0) {//no junctions recorded, mark with -1 + SJmotif=",-1"; + SJintron=",-1"; +// SJannot=",-1"; + }; + + uint trimR1=(trOut.exons[iEx1][EX_R] 0 ) { + samStreamCIGAR << trimR1 << "S"; //final trimming + }; + CIGAR=samStreamCIGAR.str(); \ No newline at end of file diff --git a/star-sys/STAR/source/ReadAlign_alignBAM.cpp b/star-sys/STAR/source/ReadAlign_alignBAM.cpp new file mode 100644 index 0000000..ab9b660 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_alignBAM.cpp @@ -0,0 +1,630 @@ +#include "ReadAlign.h" +#include "SequenceFuns.h" +#include "ErrorWarning.h" +#include "IncludeDefine.h" +#include // C++0x + +void ReadAlign::samAttrNM_MD (Transcript const &trOut, uint iEx1, uint iEx2, uint &tagNM, string &tagMD) { + tagNM=0; + tagMD=""; + char* R=Read1[trOut.roStr==0 ? 
0:2]; + uint matchN=0; + for (uint iex=iEx1;iex<=iEx2;iex++) { + for (uint ii=0;ii>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); + if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); + if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20); + if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23); + if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); + return 0; +}; + +int bamAttrArrayWrite(const int32 attr, const char* tagName, char* attrArray ) { + attrArray[0]=tagName[0];attrArray[1]=tagName[1]; + attrArray[2]='i'; + *( (int32*) (attrArray+3))=attr; + return 3+sizeof(int32); +}; +int bamAttrArrayWrite(const float attr, const char* tagName, char* attrArray ) { + attrArray[0]=tagName[0];attrArray[1]=tagName[1]; + attrArray[2]='f'; + *( (float*) (attrArray+3))=attr; + return 3+sizeof(int32); +}; +int bamAttrArrayWrite(const char attr, const char* tagName, char* attrArray ) { + attrArray[0]=tagName[0];attrArray[1]=tagName[1]; + attrArray[2]='A'; + attrArray[3]=attr; + return 3+sizeof(char); +}; +int bamAttrArrayWrite(const string &attr, const char* tagName, char* attrArray ) { + attrArray[0]=tagName[0];attrArray[1]=tagName[1]; + attrArray[2]='Z'; + memcpy(attrArray+3,attr.c_str(),attr.size()+1);//copy string data including \0 + return 3+attr.size()+1; +}; +int bamAttrArrayWrite(const vector &attr, const char* tagName, char* attrArray ) { + attrArray[0]=tagName[0];attrArray[1]=tagName[1]; + attrArray[2]='B'; + attrArray[3]='c'; + *( (int32*) (attrArray+4))=attr.size(); + memcpy(attrArray+4+sizeof(int32),attr.data(),attr.size());//copy array data + return 4+sizeof(int32)+attr.size(); +}; +int bamAttrArrayWrite(const vector &attr, const char* tagName, char* attrArray ) { + attrArray[0]=tagName[0];attrArray[1]=tagName[1]; + attrArray[2]='B'; + attrArray[3]='i'; + *( (int32*) (attrArray+4))=attr.size(); + memcpy(attrArray+4+sizeof(int32),attr.data(),sizeof(int32)*attr.size());//copy array data + return 4+sizeof(int32)+sizeof(int32)*attr.size(); +}; + +int 
bamAttrArrayWriteSAMtags(string &attrStr, char *attrArray) {//write bam record into attrArray for string attribute attString + size_t pos1=0, pos2=0; + int nattr=0; + do {//cycle over multiple tags separated by tab + pos2 = attrStr.find('\t',pos1); + string attr1 = attrStr.substr(pos1, pos2-pos1); + pos1=pos2+1; + + if (attr1.empty()) + continue; //extra tab at the beginning, or consecutive tabs + + switch (attr1.at(3)) { + case 'i': + { + int32 a1=stol(attr1.substr(5)); + nattr += bamAttrArrayWrite(a1,attr1.c_str(),attrArray+nattr); + break; + }; + case 'A': + { + char a1=attr1.at(5); + nattr += bamAttrArrayWrite(a1,attr1.c_str(),attrArray+nattr); + break; + }; + break; + case 'Z': + { + string a1=attr1.substr(5); + nattr += bamAttrArrayWrite(a1,attr1.c_str(),attrArray+nattr); + break; + }; + case 'f': + { + float a1=stof(attr1.substr(5)); + nattr += bamAttrArrayWrite(a1,attr1.c_str(),attrArray+nattr); + break; + }; + }; + } while (pos2!= string::npos); + + return nattr; +}; + +template +int bamAttrArrayWriteInt(intType xIn, const char* tagName, char* attrArray, const Parameters &P) {//adapted from samtools + attrArray[0]=tagName[0];attrArray[1]=tagName[1]; + #define ATTR_RECORD_INT(_intChar,_intType,_intValue) attrArray[2] = _intChar; *(_intType*)(attrArray+3) = (_intType) _intValue; return 3+sizeof(_intType) + int64 x = (int64) xIn; + if (x < 0) { + if (x >= -127) { + ATTR_RECORD_INT('c',int8_t,x); + } else if (x >= -32767) { + ATTR_RECORD_INT('s',int16_t,x); + } else { + ATTR_RECORD_INT('i',int32_t,x); + if (!(x>=-2147483647)) { + ostringstream errOut; + errOut <<"EXITING because of FATAL BUG: integer out of range for BAM conversion: "<< x <<"\n"; + errOut <<"SOLUTION: contact Alex Dobin at dobin@cshl.edu\n"; + exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_BUG, P); + }; + }; + } else { + if (x <= 255) { + ATTR_RECORD_INT('C',uint8_t,x); + } else if (x <= 65535) { + ATTR_RECORD_INT('S',uint16_t,x); + } else { + 
ATTR_RECORD_INT('I',uint32_t,x); + if (!(x<=4294967295)) { + ostringstream errOut; + errOut <<"EXITING because of FATAL BUG: integer out of range for BAM conversion: "<< x <<"\n"; + errOut <<"SOLUTION: contact Alex Dobin at dobin@cshl.edu\n"; + exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_BUG, P); + }; + }; + }; +}; + + + + +int ReadAlign::alignBAM(Transcript const &trOut, uint nTrOut, uint iTrOut, uint trChrStart, uint mateChr, uint mateStart, char mateStrand, int alignType, bool *mateMapped, vector outSAMattrOrder, char** outBAMarray, uint* outBAMarrayN) { + //return: number of lines (mates) + + //alignType>=0: unmapped reads + // -1: normal mapped reads + // -10: chimeric alignment, not supplemental (like -11,-12,-13) + // -11: chimeric alignment, supplemental, hard-clipping, chimeric junction on the left + // -12: chimeric alignment, supplemental, hard-clipping, chimeric junction on the right + // -13: chimeric alignment, supplemental, soft-clipping + + + if (P.outSAMmode=="None") return 0; //no SAM/BAM output + + uint32 recSize=0; //record size - total for both mates + outBAMarrayN[0]=0; + outBAMarrayN[1]=0; + + //for SAM output need to split mates + uint iExMate=0; //last exon of the first mate + + uint16 samFLAG=0; + + + bool flagPaired = P.readNmates==2; + + uint nMates=1; + if (alignType<0) {//mapped reads: SAM + for (iExMate=0;iExMate1 && P.outSAMtlen==2) { + tLen=max(trOut.exons[trOut.nExons-1][EX_G]+trOut.exons[trOut.nExons-1][EX_L],trOut.exons[iExMate][EX_G]+trOut.exons[iExMate][EX_L])-min(trOut.exons[0][EX_G],trOut.exons[iExMate+1][EX_G]); + leftMostMate=(trOut.exons[0][EX_G]<=trOut.exons[iExMate+1][EX_G] ? 0 : 1); + }; + + uint leftMate=0; //the mate (0 or 1) which is on the left + if (flagPaired) { + leftMate=trOut.Str; + }; + + if (P.outSAMattrPresent.MC) { + calcCIGAR(trOut, nMates, iExMate, leftMate); + }; + + for (uint imate=0;imate < (alignType<0 ? 
nMates:P.readNmates);imate++) { + + uint iEx1=0; + uint iEx2=0; + uint Mate=0; + uint Str=0; + uint32 packedCIGAR[BAM_CIGAR_MaxSize]; + uint32 nCIGAR=0; //number of CIGAR operations + int MAPQ=0; + uint32 attrN=0; + char attrOutArray[BAM_ATTR_MaxSize]; + uint trimL1=0, trimR1=0; + + if (alignType>=0) {//this mate is unmapped + if (mateMapped!=NULL && mateMapped[imate]) continue; //this mate was mapped, do not record it as unmapped + samFLAG=0x4; + if (P.readNmates==2) {//paired read + samFLAG|=0x1 + (imate==0 ? 0x40 : 0x80); + if (mateMapped[1-imate]) {//mate mapped + if (trOut.Str!=(1-imate)) + {//mate strand reverted + samFLAG|=0x20; + }; + mateChr=trOut.Chr; + trChrStart=mapGen.chrStart[mateChr]; + mateStart=trOut.exons[0][EX_G] - trChrStart; + mateStrand= trOut.Str == (1-imate) ? 0 : 1; + + if (!trOut.primaryFlag && P.outSAMunmapped.keepPairs) + {//mapped mate is not primary, keep unmapped mate for each pair, hence need to mark some as not primary + samFLAG|=0x100; + }; + + } else {//mate unmapped + samFLAG|=0x8; + }; + }; + + if (readFilter=='Y') samFLAG|=0x200; //not passing quality control + + if (mateMapped[1-imate]) + {//mate is mapped, fill the infromation from trOut + + }; + + Mate=imate; + Str=Mate; + + attrN=0; + attrN+=bamAttrArrayWriteInt(0,"NH",attrOutArray+attrN,P); + attrN+=bamAttrArrayWriteInt(0,"HI",attrOutArray+attrN,P); + attrN+=bamAttrArrayWriteInt(trOut.maxScore,"AS",attrOutArray+attrN,P); + attrN+=bamAttrArrayWriteInt(trOut.nMM,"nM",attrOutArray+attrN,P); + attrN+=bamAttrArrayWrite((to_string((uint) alignType)).at(0), "uT",attrOutArray+attrN); //cast to uint is only necessary for old compilers + + if (!P.outSAMattrRG.empty()) attrN+=bamAttrArrayWrite(P.outSAMattrRG.at(readFilesIndex),"RG",attrOutArray+attrN); + + } else {//this mate is mapped + if (flagPaired) {//paired reads + samFLAG=0x0001; + if (iExMate==trOut.nExons-1) {//single mate + if (mateChr>mapGen.nChrReal) samFLAG|=0x0008; //not mapped as pair + } else {//properly paired + 
samFLAG|=0x0002; //mapped as pair + }; + } else {//single end + samFLAG=0; + }; + + if (readFilter=='Y') samFLAG|=0x200; //not passing quality control + + if (alignType==-11 || alignType==-12 || alignType==-13) { + samFLAG|=0x800; //mark chimeric alignments + } else {//only non-chimeric alignments will be marked as non-primary, since chimeric are already marked with 0x800 + if (!trOut.primaryFlag) samFLAG|=0x100;//mark not primary align + }; + + iEx1 = (imate==0 ? 0 : iExMate+1); + iEx2 = (imate==0 ? iExMate : trOut.nExons-1); + Mate=trOut.exons[iEx1][EX_iFrag]; + Str= trOut.Str;//note that Strand = the mate on the left + + if (Mate==0) { + samFLAG|= Str*0x10; + if (nMates==2) samFLAG|= (1-Str)*0x20; + } else {//second mate strand need to be reverted + samFLAG|= (1-Str)*0x10; + if (nMates==2) samFLAG|= Str*0x20; + }; + + if (flagPaired) { + samFLAG|= (Mate==0 ? 0x0040 : 0x0080); + if (flagPaired && nMates==1 && mateStrand==1) samFLAG|=0x20;//revert strand using inout value of mateStrand (e.g. 
for chimeric aligns) + }; + + + uint trimL; + if (Str==0 && Mate==0) { + trimL=clip5pNtotal[Mate]; + } else if (Str==0 && Mate==1) { + trimL=clip3pNtotal[Mate]; + } else if (Str==1 && Mate==0) { + trimL=clip3pNtotal[Mate]; + } else { + trimL=clip5pNtotal[Mate]; + }; + + nCIGAR=0; //number of CIGAR operations + + trimL1 = trimL + trOut.exons[iEx1][EX_R] - (trOut.exons[iEx1][EX_R]0) { + packedCIGAR[nCIGAR++]=trimL1< SJintron; + vector SJmotif; + + for (uint ii=iEx1;ii<=iEx2;ii++) { + if (ii>iEx1) {//record gaps + uint gapG=trOut.exons[ii][EX_G]-(trOut.exons[ii-1][EX_G]+trOut.exons[ii-1][EX_L]); + uint gapR=trOut.exons[ii][EX_R]-trOut.exons[ii-1][EX_R]-trOut.exons[ii-1][EX_L]; + //it's possible to have a D or N and I at the same time + if (gapR>0){ + + packedCIGAR[nCIGAR++]=gapR<=0 || trOut.sjAnnot[ii-1]==1) {//junction: N + + packedCIGAR[nCIGAR++]=gapG<0) {//deletion: N + packedCIGAR[nCIGAR++]=gapG< 0 ) { + packedCIGAR[nCIGAR++]=trimR1<=5) { + MAPQ=0; + } else if (nTrOut>=3) { + MAPQ=1; + } else if (nTrOut==2) { + MAPQ=3; + }; + + //attribute string + uint tagNM=(uint) -1; + string tagMD(""); + + attrN=0; + for (uint ii=0;ii rb; + for (uint ii=iEx1;ii<=iEx2;ii++) { + rb.push_back( (int32) trOut.exons[ii][EX_R]+1 ); + rb.push_back( (int32) trOut.exons[ii][EX_R]+trOut.exons[ii][EX_L]); + rb.push_back( (int32) (trOut.exons[ii][EX_G]-mapGen.chrStart[trOut.Chr]+1) ); + rb.push_back( (int32) (trOut.exons[ii][EX_G]-mapGen.chrStart[trOut.Chr]+trOut.exons[ii][EX_L]) ); + }; + attrN+=bamAttrArrayWrite(rb,"rB",attrOutArray+attrN); + }; + break; + case ATTR_vG: + { + const vector &v1=trOut.varGenCoord; + if (v1.size()>0) + attrN+=bamAttrArrayWrite(v1,"vG",attrOutArray+attrN); + break; + }; + case ATTR_vA: + { + const vector &v1=trOut.varAllele; + if (v1.size()>0) + attrN+=bamAttrArrayWrite(v1,"vA",attrOutArray+attrN); + break; + }; + case ATTR_vW: + { + if (waspType!=-1) + attrN+=bamAttrArrayWrite( (int32) waspType, "vW", attrOutArray+attrN ); + break; + }; + + case ATTR_ch: + 
if (alignType<=-10) + {//chimeric alignment + attrN+=bamAttrArrayWrite('1',"ch",attrOutArray+attrN); + }; + break; + case ATTR_MC: + if (nMates>1) + {//chimeric alignment + attrN+=bamAttrArrayWrite(matesCIGAR[1-imate],"MC",attrOutArray+attrN); + }; + break; + + case ATTR_CR: + attrN+=bamAttrArrayWrite(soloRead->readBar->cbSeq,"CR",attrOutArray+attrN); + break; + case ATTR_CY: + attrN+=bamAttrArrayWrite(soloRead->readBar->cbQual,"CY",attrOutArray+attrN); + break; + case ATTR_UR: + attrN+=bamAttrArrayWrite(soloRead->readBar->umiSeq,"UR",attrOutArray+attrN); + break; + case ATTR_UY: + attrN+=bamAttrArrayWrite(soloRead->readBar->umiQual,"UY",attrOutArray+attrN); + break; + + default: + ostringstream errOut; + errOut <<"EXITING because of FATAL BUG: unknown/unimplemented SAM/BAM atrribute (tag): "<logMain, EXIT_CODE_PARAMETER, P); + }; + }; + }; + + if (P.readFilesTypeN==10) { +// if (readNameExtra[Mate].size()<1) +// cout << iReadAll <<" " < reg2bin() function in Section 4.3; l read name is the length> of read name below (= length(QNAME) + 1).> uint32 t + if (alignType<0) { + pBAM[3]=( ( reg2bin(trOut.exons[iEx1][EX_G] - trChrStart,trOut.exons[iEx2][EX_G] + trOut.exons[iEx2][EX_L] - trChrStart) << 16 ) \ + |( MAPQ<<8 ) | ( strlen(readName) ) ); //note:read length includes 0-char + } else { + pBAM[3]=( reg2bin(-1,0) << 16 | strlen(readName) );//4680=reg2bin(-1,0) + }; + + //4: FLAG<<16|n cigar op; n cigar op is the number of operations in CIGAR. + pBAM[4]=( ( ((samFLAG & P.outSAMflagAND) | P.outSAMflagOR) << 16 ) | (nCIGAR) ); + + //5: l seq Length of SEQ + pBAM[5]=seqMateLength; + + //6: next refID Ref-ID of the next segment (ô€€€1  mate refID < n ref) + if (nMates>1) { + pBAM[6]=trOut.Chr; + } else if (mateChr1) { + pBAM[7]=trOut.exons[(imate==0 ? iExMate+1 : 0)][EX_G] - trChrStart; + } else if (mateChr1) { + if (P.outSAMtlen==1) { + int32 tlen=trOut.exons[trOut.nExons-1][EX_G]+trOut.exons[trOut.nExons-1][EX_L]-trOut.exons[0][EX_G]; + pBAM[8]=(imate==0 ? 
tlen : -tlen); + } else if (P.outSAMtlen==2) { + int32 tlen=(int32)tLen; + pBAM[8]=(imate==leftMostMate ? tlen : -tlen); + }; + } else { + pBAM[8]=0; + }; + + recSize+=9*sizeof(int32); //core record size + + //Read name1, NULL terminated (QNAME plus a tailing `\0') + strcpy(outBAMarray[imate]+recSize,readName+1); + recSize+=strlen(readName); + + //CIGAR: op len<<4|op. `MIDNSHP=X'!`012345678' + memcpy(outBAMarray[imate]+recSize,packedCIGAR, nCIGAR*sizeof(int32)); + recSize+=nCIGAR*sizeof(int32); + + //4-bit encoded read: `=ACMGRSVTWYHKDBN'! [0; 15]; other characters mapped to `N'; high nybble rst (1st base in the highest 4-bit of the 1st byte) + memcpy(outBAMarray[imate]+recSize,seqMate,(seqMateLength+1)/2); + recSize+=(seqMateLength+1)/2; + + //Phred base quality (a sequence of 0xFF if absent) + if (readFileType==2 && P.outSAMmode != "NoQS") {//output qualtiy + for (uint32 ii=0; ii>P.winBinNbits]; + + if (iW==uintWinBinMax || (!aAnchor && aLength < WALrec[iW]) ) return; //alignment does not belong to any window, or it's shorter than rec-length + + //check if this alignment overlaps with any other alignment in the window, record the longest of the two + {//do not check for overlap if this is an sj-align + uint iA; + for (iA=0; iA=WA[iW][iA][WA_rStart] && aRstart=WA[iW][iA][WA_rStart] && aRstart+aLengthWA[iW][iA][WA_Length]) {//replace + + uint iA0;//iA0 is where the align has to be inserted + for (iA0=0;iA0iA) + {//true insertion place since iA will be removed + --iA0; + }; + + if (iA0iA0;iA1--) {//shift aligns to free up insertion point + for (uint ii=0;iiiA) {//shift aligns up + for (uint iA1=iA;iA1 WALrec[iW] ) {//re-record the anchors and long aligns + for (uint ii=0; ii WALrec[iW] ) { + if (nWA[iW]>=P.seedPerWindowNmax) { + exitWithError("BUG: iA>=P.seedPerWindowNmax in stitchPieces, exiting",std::cerr, P.inOut->logMain, EXIT_CODE_BUG, P); + }; + + uint iA; + for (iA=0;iAiA;iA1--) {//shift aligns for free up insertion point + for (uint ii=0;ii0) { + 
samStreamCIGAR << trimL1 << "S"; //initial trimming + }; + + for (uint ii=iEx1;ii<=iEx2;ii++) { + if (ii>iEx1) {//record gaps + uint gapG=trOut.exons[ii][EX_G]-(trOut.exons[ii-1][EX_G]+trOut.exons[ii-1][EX_L]); + uint gapR=trOut.exons[ii][EX_R]-trOut.exons[ii-1][EX_R]-trOut.exons[ii-1][EX_L]; + //it's possible to have a D or N and I at the same time + if (gapR>0){ + samStreamCIGAR << gapR; + samStreamCIGAR << "I"; + }; + if (trOut.canonSJ[ii-1]>=0 || trOut.sjAnnot[ii-1]==1) {//junction: N + samStreamCIGAR << gapG; + samStreamCIGAR << "N"; + } else if (gapG>0) {//deletion: N + samStreamCIGAR << gapG; + samStreamCIGAR << "D"; + }; + }; + samStreamCIGAR << trOut.exons[ii][EX_L] << "M"; + }; + + uint trimR1=(trOut.exons[iEx1][EX_R] 0 ) { + samStreamCIGAR << trimR1 << "S"; //final trimming + }; + matesCIGAR.push_back(samStreamCIGAR.str()); + }; +}; \ No newline at end of file diff --git a/star-sys/STAR/source/ReadAlign_chimericDetection.cpp b/star-sys/STAR/source/ReadAlign_chimericDetection.cpp new file mode 100644 index 0000000..aabd355 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_chimericDetection.cpp @@ -0,0 +1,57 @@ +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "ReadAlign.h" +#include "BAMfunctions.h" +#include "blocksOverlap.h" + +//#include "SequenceFuns.h" +//#include "stitchWindowAligns.h" +//#include "sjSplitAlign.cpp" +//#include "PackedArray.h" +//#include "alignSmithWaterman.h" +//#include "GlobalVariables.h" +//#include + +void ReadAlign::chimericDetection() { + + chimRecord=false; + + if (P.pCh.segmentMin==0) {//no chimeric detection requested + return; + }; + if (P.outFilterBySJoutStage>1) {//no chimeric output for stage=2. 
REVISIT: NOT SURE why + return; + }; + + //output chains for out-of-STAR chimeric detection + #ifdef OUTPUT_localChains + { + P.inOut->outLocalChains << readName <<"\t"<< Read0[0] <<"\t"<< Read0[1] << "\n"; + for (uint iw=0; iwoutLocalChains << trAll[iw][itr]->maxScore<<"\t"<< trAll[iw][itr]->Chr<<"\t"<Str<<"\t"<nExons; + for (uint ib=0;ibnExons;ib++) { + P.inOut->outLocalChains <<"\t"<< trAll[iw][itr]->exons[ib][EX_G]-mapGen.chrStart[trAll[iw][itr]->Chr] \ + <<"\t"<< trAll[iw][itr]->exons[ib][EX_R] <<"\t"<< trAll[iw][itr]->exons[ib][EX_L]; + }; + P.inOut->outLocalChains <<"\n"; + }; + }; + }; + #endif + + + if (P.pCh.multimapNmax==0) { + chimRecord=chimericDetectionOld(); + chimericDetectionOldOutput(); + } else if (trBest->maxScore <= (int) (readLength[0]+readLength[1]) - (int) P.pCh.nonchimScoreDropMin) {//require big enough drop in the best score + chimRecord=chimDet->chimericDetectionMult(nW, readLength, trBest->maxScore, false); + }; + + if ( chimRecord ) { + statsRA.chimericAll++; + }; + + return; +};//END diff --git a/star-sys/STAR/source/ReadAlign_chimericDetectionOld.cpp b/star-sys/STAR/source/ReadAlign_chimericDetectionOld.cpp new file mode 100644 index 0000000..715ece1 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_chimericDetectionOld.cpp @@ -0,0 +1,312 @@ +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "ReadAlign.h" +#include "blocksOverlap.h" + +bool ReadAlign::chimericDetectionOld() { + + //////////////////// chimeras + //stich windows => chimeras + //stich only the best window with one of the lower score ones for now - do not stich 2 lower score windows + //stitch only one window on each end of the read + + if (nTr>P.pCh.mainSegmentMultNmax && nTr!=2) + {//multimapping main segment, nTr==2 is a special case to be checked later + return false; + }; + + if ( !(P.pCh.segmentMin>0 && trBest->rLength >= P.pCh.segmentMin \ + && ( trBest->exons[trBest->nExons-1][EX_R] + trBest->exons[trBest->nExons-1][EX_L] + 
P.pCh.segmentMin <= Lread \ + || trBest->exons[0][EX_R] >= P.pCh.segmentMin ) \ + && trBest->intronMotifs[0]==0 && (trBest->intronMotifs[1]==0 || trBest->intronMotifs[2]==0) ) ) { + //there sholud be unmapped space at the start/end, and the main window is not a multimapping window, and non non-canonical junctions, and consistend junction motif + return false; + }; + + int chimScoreBest=0,chimScoreNext=0; + trChim[0]=*trBest; + Transcript* trChim1=NULL; + + uint roStart1=trBest->Str==0 ? trBest->exons[0][EX_R] : Lread - trBest->exons[trBest->nExons-1][EX_R] - trBest->exons[trBest->nExons-1][EX_L]; + uint roEnd1=trBest->Str==0 ? trBest->exons[trBest->nExons-1][EX_R] + trBest->exons[trBest->nExons-1][EX_L] - 1 : Lread - trBest->exons[0][EX_R] - 1; + if (roStart1>readLength[0]) roStart1--; + if (roEnd1>readLength[0]) roEnd1--; + + uint chimStrBest=0; + if (trBest->intronMotifs[1]==0 && trBest->intronMotifs[2]==0) {//strand is undefined + chimStr=0; + } else if ( (trBest->Str==0) == (trBest->intronMotifs[1]>0)) {//strand the same as RNA + chimStr=1; + } else {//strand opposite to RNA + chimStr=2; + }; + + for (uint iW=0; iW0) break; //for all windows except that of the best transcript - hceck only iWt=0 (best trnascripts) + if (trBest==trAll[iW][0] && iWt==0) continue; + if (trAll[iW][iWt]->intronMotifs[0]>0) continue; //do not stitch a window to itself, or to a window with non-canonical junctions + uint chimStr1; + if (trAll[iW][iWt]->intronMotifs[1]==0 && trAll[iW][iWt]->intronMotifs[2]==0) {//strand is undefined + chimStr1=0; + } else if ( (trAll[iW][iWt]->Str==0) == (trAll[iW][iWt]->intronMotifs[1]>0)) {//strand the same as RNA + chimStr1=1; + } else {//strand opposite to RNA + chimStr1=2; + }; + + if (chimStr!=0 && chimStr1!=0 && chimStr!=chimStr1) continue; //chimeric segments have to have consitent strands + + uint roStart2=trAll[iW][iWt]->Str==0 ? 
trAll[iW][iWt]->exons[0][EX_R] : Lread - trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_R] - trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_L]; + uint roEnd2=trAll[iW][iWt]->Str==0 ? trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_R] + trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_L] - 1 : Lread - trAll[iW][iWt]->exons[0][EX_R] - 1; + if (roStart2>readLength[0]) roStart2--; + if (roEnd2>readLength[0]) roEnd2--; + + uint chimOverlap = roStart2>roStart1 ? (roStart2>roEnd1 ? 0 : roEnd1-roStart2+1) : (roEnd2= readLength[0]) || (roEnd2 < readLength[0] && roStart1 >= readLength[0]); + + //segment lengths && (different mates || small gap between segments) + if (roEnd1 > P.pCh.segmentMin + roStart1 + chimOverlap && roEnd2> P.pCh.segmentMin + roStart2 + chimOverlap \ + && ( diffMates || ( (roEnd1 + P.pCh.segmentReadGapMax + 1) >= roStart2 && (roEnd2 + P.pCh.segmentReadGapMax + 1) >= roStart1 ) ) ) { + + int chimScore=trBest->maxScore + trAll[iW][iWt]->maxScore - (int)chimOverlap; //subtract overlap to avoid double counting + + uint overlap1=0; + if (iWt>0 && chimScoreBest>0) + {//overlap between chimeric candidate segment and the best chimeric segment so far. Maybe non-zero only if both are in the same window. + overlap1=blocksOverlap(trChim[1],*trAll[iW][iWt]); + }; + + if (chimScore > chimScoreBest) { + trChim[1]=*trAll[iW][iWt]; + trChim1=trAll[iW][iWt]; + if (overlap1==0) + { + chimScoreNext=chimScoreBest; + }; + chimScoreBest=chimScore; + trChim[1].roStart = trChim[1].roStr ==0 ? 
trChim[1].rStart : Lread - trChim[1].rStart - trChim[1].rLength; + trChim[1].cStart = trChim[1].gStart - mapGen.chrStart[trChim[1].Chr]; + chimStrBest=chimStr1; + } else if (chimScore>chimScoreNext && overlap1==0) {//replace the nextscore if it's not the best one and is higher than the previous one + chimScoreNext=chimScore; + }; + }; + };//cycle over window transcripts + };//cycle over windows + + if (!(chimScoreBest >= P.pCh.scoreMin && chimScoreBest+P.pCh.scoreDropMax >= (int) (readLength[0]+readLength[1]) ) ) { + return false; + }; + + if (nTr>P.pCh.mainSegmentMultNmax) {//check main segment for multi-aligns + //this is nTr==2 - a special case: chimeras are allowed only if the 2nd chimeric segment is the next best alignment + if ( trChim1!=trMult[0] && trChim1!=trMult[1] ) { + return false; + }; + }; + + if (chimStr==0) chimStr=chimStrBest; + + chimN=0; + if (chimScoreNext + P.pCh.scoreSeparation >= chimScoreBest) {//report only if chimera is unique + //cout << " " << chimScoreBest << " " << chimScoreNext; + return false; + }; + if (trChim[0].roStart > trChim[1].roStart) swap (trChim[0],trChim[1]); + + uint e0 = trChim[0].Str==1 ? 0 : trChim[0].nExons-1; + uint e1 = trChim[1].Str==0 ? 
0 : trChim[1].nExons-1; + + chimRepeat0=0;chimRepeat1=0;chimJ0=0;chimJ1=0;chimMotif=0; + + chimN=2; + if ( trChim[0].exons[e0][EX_iFrag] > trChim[1].exons[e1][EX_iFrag] ) {//strange configuration, rare, similar to the next one + return false;//reject such chimeras + //good test example: + //CTTAGCTAGCAGCGTCTTCCCAGTGCCTGGAGGGCCAGTGAGAATGGCACCCTCTGGGATTTTTGCTCCTAGGTCT + //TTGAGGTGAAGTTCAAAGATGTGGCTGGCTGTGAGGAGGCCGAGCTAGAGATCATGGAATTTGTGAATTTCTTGAA + } else if ( trChim[0].exons[e0][EX_iFrag] < trChim[1].exons[e1][EX_iFrag] ) {//mates bracket the chimeric junction + chimN=2; + chimRepeat=0; + chimMotif=-1; + if (trChim[0].Str==1) {//negative strand + chimJ0=trChim[0].exons[e0][EX_G]-1; + } else { + chimJ0=trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]; + }; + if (trChim[1].Str==0) {//positive strand + chimJ1=trChim[1].exons[e1][EX_G]-1; + } else { + chimJ1=trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]; + }; + } else {//chimeric junctions is within one of the mates, check and shift chimeric junction if necessary + if (!(trChim[0].exons[e0][EX_L]>=P.pCh.junctionOverhangMin && trChim[1].exons[e1][EX_L]>=P.pCh.junctionOverhangMin )) { + //large enough overhang required + return false; + }; + uint roStart0 = trChim[0].Str==0 ? trChim[0].exons[e0][EX_R] : Lread - trChim[0].exons[e0][EX_R] - trChim[0].exons[e0][EX_L]; + uint roStart1 = trChim[1].Str==0 ? trChim[1].exons[e1][EX_R] : Lread - trChim[1].exons[e1][EX_R] - trChim[1].exons[e1][EX_L]; + + uint jR, jRbest=0; + int jScore=0,jMotif=0,jScoreBest=-999999,jScoreJ=0; + uint jRmax = roStart1+trChim[1].exons[e1][EX_L]; + jRmax = jRmax>roStart0 ? 
jRmax-roStart0-1 : 0; + for (jR=0; jR3 || b1>3) ) || bR>3) {//chimera is not called if there are Ns in the genome or in the read + chimN=0; + break; + }; + + char b01,b02,b11,b12; + if (trChim[0].Str==0) { + b01=mapGen.G[trChim[0].exons[e0][EX_G]+jR+1]; + b02=mapGen.G[trChim[0].exons[e0][EX_G]+jR+2]; + } else { + b01=mapGen.G[trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]-1-jR-1]; + if (b01<4) b01=3-b01; + b02=mapGen.G[trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]-1-jR-2]; + if (b02<4) b02=3-b02; + }; + if (trChim[1].Str==0) { + b11=mapGen.G[trChim[1].exons[e1][EX_G]-roStart1+roStart0+jR-1]; + b12=mapGen.G[trChim[1].exons[e1][EX_G]-roStart1+roStart0+jR]; + } else { + b11=mapGen.G[trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]-1+roStart1-roStart0-jR+1]; + if (b11<4) b11=3-b11; + b12=mapGen.G[trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]-1+roStart1-roStart0-jR]; + if (b12<4) b12=3-b12; + }; + + jMotif=0; + if (b01==2 && b02==3 && b11==0 && b12==2) {//GTAG + if (chimStr!=2) { + jMotif=1; + }; + } else if(b01==1 && b02==3 && b11==0 && b12==1) {//CTAC + if (chimStr!=1) { + jMotif=2; + }; + }; + + if (bR==b0 && bR!=b1) { + jScore++; + } else if (bR!=b0 && bR==b1) { + jScore--; + }; + + jScoreJ =jMotif==0 ? 
jScore + P.pCh.scoreJunctionNonGTAG : jScore ; + + if ( jScoreJ > jScoreBest || (jScoreJ == jScoreBest && jMotif>0) ) { + chimMotif=jMotif; + jRbest=jR; + jScoreBest=jScoreJ; + }; + };//jR cycle + if ( chimN==0 ) {//the chimera was rejected because of mismatches + return false; + }; + + if (chimMotif==0) {//non-canonical chimera + chimScoreBest += 1+P.pCh.scoreJunctionNonGTAG; //+1 + if ( !(chimScoreBest >= P.pCh.scoreMin && chimScoreBest+P.pCh.scoreDropMax >= (int) (readLength[0]+readLength[1])) ) { + return false; + }; + }; + + + //shift junction in trChim + if (trChim[0].Str==1) { + trChim[0].exons[e0][EX_R] +=trChim[0].exons[e0][EX_L]-jRbest-1; + trChim[0].exons[e0][EX_G] +=trChim[0].exons[e0][EX_L]-jRbest-1; + trChim[0].exons[e0][EX_L]=jRbest+1; + chimJ0=trChim[0].exons[e0][EX_G]-1; + } else { + trChim[0].exons[e0][EX_L]=jRbest+1; + chimJ0=trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]; + }; + + if (trChim[1].Str==0) { + trChim[1].exons[e1][EX_R] +=roStart0+jRbest+1-roStart1; + trChim[1].exons[e1][EX_G] +=roStart0+jRbest+1-roStart1; + trChim[1].exons[e1][EX_L]=roStart1+trChim[1].exons[e1][EX_L]-roStart0-jRbest-1; + chimJ1=trChim[1].exons[e1][EX_G]-1; + } else { + trChim[1].exons[e1][EX_L]=roStart1+trChim[1].exons[e1][EX_L]-roStart0-jRbest-1; + chimJ1=trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]; + }; + //find repeats + char b0,b1; + for (jR=0;jR<100;jR++) {//forward check + if (trChim[0].Str==0) { + b0=mapGen.G[chimJ0+jR]; + } else { + b0=mapGen.G[chimJ0-jR]; + if (b0<4) b0=3-b0; + }; + + if (trChim[1].Str==0) { + b1=mapGen.G[chimJ1+1+jR]; + } else { + b1=mapGen.G[chimJ1-1-jR]; + if (b1<4) b1=3-b1; + }; + if (b0!=b1) break; + }; + chimRepeat1=jR; + for (jR=0;jR<100;jR++) {//reverse check + if (trChim[0].Str==0) { + b0=mapGen.G[chimJ0-1-jR]; + } else { + b0=mapGen.G[chimJ0+1+jR]; + if (b0<4) b0=3-b0; + }; + + if (trChim[1].Str==0) { + b1=mapGen.G[chimJ1-jR]; + } else { + b1=mapGen.G[chimJ1+jR]; + if (b1<4) b1=3-b1; + }; + if (b0!=b1) break; + }; + 
chimRepeat0=jR; + };//chimeric junction is within a mate + + //final check + if ( trChim[0].Str!=trChim[1].Str || trChim[0].Chr!=trChim[1].Chr \ + || (trChim[0].Str==0 ? chimJ1-chimJ0+1LLU : chimJ0-chimJ1+1LLU) > (chimMotif>=0 ? P.alignIntronMax : P.alignMatesGapMax) ) { + //chimera has to bw from different chr/strand, or far away + + if (chimMotif>=0 && \ + (trChim[0].exons[e0][EX_L] +#include "ReadAlign.h" +#include "BAMfunctions.h" + +void ReadAlign::chimericDetectionOldOutput() { + + if (!chimRecord) { + return; + }; + + throw std::runtime_error("Unimplemented!"); + + chimN=2; //this is hard-coded for now + //re-calculate the score for chimeric transcripts + trChim[0].alignScore(Read1, mapGen.G, P); + trChim[1].alignScore(Read1, mapGen.G, P); + + int chimRepresent=-999, chimType=0; + if (trChim[0].exons[0][EX_iFrag]!=trChim[0].exons[trChim[0].nExons-1][EX_iFrag]) {//tr0 has both mates + chimRepresent = 0; + chimType = 1; + trChim[0].primaryFlag=true;//paired portion is primary + trChim[1].primaryFlag=false; + } else if (trChim[1].exons[0][EX_iFrag]!=trChim[1].exons[trChim[1].nExons-1][EX_iFrag]) {//tr1 has both mates + chimRepresent = 1; + chimType = 1; + trChim[1].primaryFlag=true;//paired portion is primary + trChim[0].primaryFlag=false; + } else if (trChim[0].exons[0][EX_iFrag]!=trChim[1].exons[0][EX_iFrag]) {//tr0 and tr1 are single different mates + chimRepresent = -1; + chimType = 2; + trChim[0].primaryFlag=true; + trChim[1].primaryFlag=true; + } else {//two chimeric segments are on the same mate - this can only happen for single-end reads + chimRepresent = (trChim[0].maxScore > trChim[1].maxScore) ? 
0 : 1; + chimType = 3; + trChim[chimRepresent].primaryFlag=true; + trChim[1-chimRepresent].primaryFlag=false; + }; + + if (P.pCh.out.bam) {//BAM output + int alignType, bamN=0, bamIsuppl=-1, bamIrepr=-1; + uint bamBytesTotal=0;//estimate of the total size of all bam records, for output buffering + uint mateChr,mateStart; + uint8_t mateStrand; + for (uint itr=0;itr=0) { + bam1_t *b; + //b=bam_init1(); + bam_read1_fromArray(outBAMoneAlign[tagI], b); + uint8_t* auxp=NULL;//bam_aux_get(b,"NM"); + uint32_t auxv= 0;//bam_aux2i(auxp); + string tagSA1="SAZ"+mapGen.chrName[b->core.tid]+','+to_string((uint)b->core.pos+1) +',' + ( (b->core.flag&0x10)==0 ? '+':'-') + \ + ',' + bam_cigarString(b) + ',' + to_string((uint)b->core.qual) + ',' + to_string((uint)auxv) + ';' ; + + memcpy( (void*) (outBAMoneAlign[ii]+outBAMoneAlignNbytes[ii]), tagSA1.c_str(), tagSA1.size()+1);//copy string including \0 at the end + outBAMoneAlignNbytes[ii]+=tagSA1.size()+1; + * ( (uint32*) outBAMoneAlign[ii] ) = outBAMoneAlignNbytes[ii]-sizeof(uint32); + }; + + if (P.outBAMunsorted) outBAMunsorted->unsortedOneAlign(outBAMoneAlign[ii], outBAMoneAlignNbytes[ii], ii>0 ? 
0 : bamBytesTotal); + if (P.outBAMcoord) outBAMcoord->coordOneAlign(outBAMoneAlign[ii], outBAMoneAlignNbytes[ii], (iReadAll<<32) ); + }; + }; + + if (P.pCh.out.samOld) { + for (uint iTr=0;iTrtrChim[ii-1].exons[0][EX_G])) { + segLmin=segLen[ii][jj]; + i1=ii;//trChim of the shortest segment length + i2=jj;//mate of the shortest segment length + }; + }; + }; + + if (i2==1) {//eliminate mate1: simply cut the exons that belong to mate1 + trChim[i1].nExons=segEx[i1]+1; + } else {//eliminate mate 0: shift mate1 exon to the beginning + for (uint iex=0; iexmaxScore <= (int) (readLength[0]+readLength[1]) - (int) P.pCh.nonchimScoreDropMin) {//require big enough drop in the best score + + // new chimeric detection routine + + chimRecord=seRA.chimDet->chimericDetectionMult(seRA.nW, seRA.readLength, seRA.trBest->maxScore, true); + }; + + if ( chimRecord ) { + statsRA.chimericAll++; + }; + + return; +}; diff --git a/star-sys/STAR/source/ReadAlign_createExtendWindowsWithAlign.cpp b/star-sys/STAR/source/ReadAlign_createExtendWindowsWithAlign.cpp new file mode 100644 index 0000000..d6b8004 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_createExtendWindowsWithAlign.cpp @@ -0,0 +1,82 @@ +#include "IncludeDefine.h" +#include "Parameters.h" +//#include "Transcript.h" +#include "ReadAlign.h" +#include "SequenceFuns.h" + +int ReadAlign::createExtendWindowsWithAlign(uint a1, uint aStr) { + + uint aBin = (a1 >> P.winBinNbits); //align's bin + uint iBinLeft=aBin, iBinRight=aBin; + uintWinBin* wB=winBin[aStr]; + uint iBin=-1, iWin=-1, iWinRight=-1; + + if (wB[aBin]==uintWinBinMax) {//proceed if there is no window at this bin + //check neighboring bins + bool flagMergeLeft=false; + if (aBin>0) {//merge left only if there are bins on the left + for (iBin=aBin-1; iBin >= ( aBin>P.winAnchorDistNbins ? 
aBin-P.winAnchorDistNbins : 0 ); --iBin) {//go left, find windows in Anchor range + if (wB[iBin]>P.winBinChrNbits]==mapGen.chrBin[aBin>>P.winBinChrNbits]); + if (flagMergeLeft) {//this align can be merged into the existing window + iWin=wB[iBin]; + iBinLeft=WC[iWin][WC_gStart]; + for (uint ii=iBin+1; ii<=aBin; ii++) {//mark al bins with the existing windows ID + wB[ii]=iWin; + }; + }; + }; + + bool flagMergeRight=false; + if (aBin+1>P.winBinChrNbits]==mapGen.chrBin[aBin>>P.winBinChrNbits]); + if (flagMergeRight) {//this align can be merged into the existing window + while (wB[iBin]==wB[iBin+1]) ++iBin; //extend through all bins of the right window + iBinRight=iBin; + iWinRight=wB[iBin]; +// if (iBin!=WC[iWinRight][WC_gEnd]) {//debug, switch off!!! +// cerr <<"BUG in createWindows"<> P.winBinChrNbits]; + WC[iWin][WC_Str]=aStr; + WC[iWin][WC_gEnd]=WC[iWin][WC_gStart]=aBin; + ++nW; + if (nW>=P.alignWindowsPerReadNmax) { + nW=P.alignWindowsPerReadNmax-1; + return EXIT_createExtendWindowsWithAlign_TOO_MANY_WINDOWS; //too many windows, do not record TODO: record a marker + }; + } else {//record windows after merging + WC[iWin][WC_gStart]=iBinLeft; + WC[iWin][WC_gEnd]=iBinRight; + if (flagMergeLeft && flagMergeRight) {//kill right window, it was merged with the left one + WC[iWinRight][WC_gStart]=1; + WC[iWinRight][WC_gEnd]=0; + }; + }; + }; + return 0; +}; + diff --git a/star-sys/STAR/source/ReadAlign_mapOneRead.cpp b/star-sys/STAR/source/ReadAlign_mapOneRead.cpp new file mode 100644 index 0000000..01e5954 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_mapOneRead.cpp @@ -0,0 +1,122 @@ +#include "ReadAlign.h" +#include "SequenceFuns.h" +#include "Stats.h" +#include "serviceFuns.cpp" + +int ReadAlign::mapOneRead() { + + #ifdef OFF_BEFORE_SEEDING + #warning OFF_BEFORE_SEEDING + nW=0; + return 0; + #endif + + revertStrand = false; //the 2nd read is awlays on opposite strand. 1st and 2nd reads have already been reversed. 
+ + if (Lread>0) { + //printf("splitting\n"); + Nsplit=qualitySplit(Read1[0], Qual1[0], Lread, P.Qsplit, P.maxNsplit, P.minLsplit, splitR); + // splitR[0][fragnum] => good region start position (from SequenceFuns.cpp) + // splitR[1][fragnum] => good reagion length + // splitR[2][fragnum] => fragnum ? + } else { + Nsplit=0; + }; + //printf("read %s; qual %s\n", Read1[0], Qual1[0]); + //printf("Nsplit %llu\n", Nsplit); + + resetN(); //reset aligns counters to 0 + + //reset/initialize a transcript + trInit->reset(); + trInit->Chr=0; trInit->Str=0; trInit->roStr=0; trInit->cStart=0; trInit->gLength=0; //to generate nice output of 0 for non-mapped reads + trInit->iRead=iRead; + trInit->Lread=Lread; + trInit->nExons=0; + trInit->readLengthOriginal=readLengthOriginal; + trInit->readLengthPairOriginal=readLengthPairOriginal; + trInit->readLength=readLength; + trInit->readNmates=readNmates; + trInit->readName=readName; + + trBest=trInit; + + uint seedSearchStartLmax=min(P.seedSearchStartLmax, // 50 + (uint) (P.seedSearchStartLmaxOverLread*(Lread-1))); // read length + // align all good pieces + for (uint ip=0; ip0 && seedSearchStartLmax0) {//check if the 1st piece in reveree direction does not need to be remapped + Lmapped=0; // length of segment mapped so far. + + // begin mapping starting from segment start position (istart*Lstart) + while ( istart*Lstart + Lmapped + P.minLmap < splitR[1][ip] ) {//map until unmapped portion is <=minLmap (default: 5) + + // Shift is the position in the read to begin mapping from. + uint Shift = iDir==0 ? ( splitR[0][ip] + istart*Lstart + Lmapped ) : \ + ( splitR[0][ip] + splitR[1][ip] - istart*Lstart-1-Lmapped); //choose Shift for forward or reverse + + //uint seedLength=min(splitR[1][ip] - Lmapped - istart*Lstart, P.seedSearchLmax); + uint seedLength=splitR[1][ip] - Lmapped - istart*Lstart; // what's left of the read to align. 
+ maxMappableLength2strands(Shift, seedLength, iDir, 0, mapGen.nSA-1, L, splitR[2][ip]);//L=max mappable length, unique or multiple + if (iDir==0 && istart==0 && Lmapped==0 && Shift+L == splitR[1][ip] ) {//this piece maps full length and does not need to be mapped from the opposite direction + flagDirMap=false; + }; + Lmapped+=L; + };//while ( istart*Lstart + Lmapped + P.minLmap < splitR[1][ip] ) + };//if (flagDirMap || istart>0) + if (P.seedSearchLmax>0) {//search fixed length. Not very efficient, need to improve + // off by default. + uint Shift = iDir==0 ? ( splitR[0][ip] + istart*Lstart ) : \ + ( splitR[0][ip] + splitR[1][ip] - istart*Lstart-1); //choose Shift for forward or reverse + uint seedLength = min(P.seedSearchLmax, iDir==0 ? (splitR[0][ip] + splitR[1][ip]-Shift):(Shift+1) ); + maxMappableLength2strands(Shift, seedLength, iDir, 0, mapGen.nSA-1, L, splitR[2][ip]);//L=max mappable length, unique or multiple + }; + + +// #endif + };//for (uint istart=0; istartrLength=0; //min good piece length + nW=0; + } else if (Nsplit==0) {//no good pieces + mapMarker=MARKER_NO_GOOD_PIECES; + trBest->rLength=splitR[1][0]; //min good piece length + nW=0; + } else if (Nsplit>0 && nA==0) { + mapMarker=MARKER_ALL_PIECES_EXCEED_seedMultimapNmax; + trBest->rLength=multNminL; + nW=0; + } else if (Nsplit>0 && nA>0) {//otherwise there are no good pieces, or all pieces map too many times: read cannot be mapped +// qsort((void*) PC, nP, sizeof(uint)*PC_SIZE, funCompareUint2);//sort PC by rStart and length + //printf("stitching\n"); + stitchPieces(Read1, Lread); + + }; + + return 0; +}; diff --git a/star-sys/STAR/source/ReadAlign_mappedFilter.cpp b/star-sys/STAR/source/ReadAlign_mappedFilter.cpp new file mode 100644 index 0000000..8d351c2 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_mappedFilter.cpp @@ -0,0 +1,17 @@ +#include "ReadAlign.h" + +void ReadAlign::mappedFilter() {//filter mapped read, add to stats + unmapType=-1;//mark as mapped + if ( nW==0 ) {//no good windows + 
unmapType=0; + } else if ( (trBest->maxScore < P.outFilterScoreMin) || (trBest->maxScore < (intScore) (P.outFilterScoreMinOverLread*(Lread-1))) \ + || (trBest->nMatch < P.outFilterMatchNmin) || (trBest->nMatch < (uint) (P.outFilterMatchNminOverLread*(Lread-1))) ) {//too short + unmapType=1; + } else if ( (trBest->nMM > outFilterMismatchNmaxTotal) || (double(trBest->nMM)/double(trBest->rLength)>P.outFilterMismatchNoverLmax) ) {//too many mismatches + unmapType=2; + } else if (nTr > P.outFilterMultimapNmax){//too multi + unmapType=3; + }; + + return; +}; \ No newline at end of file diff --git a/star-sys/STAR/source/ReadAlign_maxMappableLength2strands.cpp b/star-sys/STAR/source/ReadAlign_maxMappableLength2strands.cpp new file mode 100644 index 0000000..ff5653f --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_maxMappableLength2strands.cpp @@ -0,0 +1,107 @@ +#include "ReadAlign.h" +#include "SuffixArrayFuns.h" +#include "ErrorWarning.h" + +uint ReadAlign::maxMappableLength2strands(uint pieceStartIn, uint pieceLengthIn, uint iDir, uint iSA1, uint iSA2, uint& maxLbest, uint iFrag) { + //returns number of mappings, maxMappedLength=mapped length + uint Nrep=0, indStartEnd[2], maxL; + + uint NrepAll[P.pGe.gSAsparseD], indStartEndAll[P.pGe.gSAsparseD][2], maxLall[P.pGe.gSAsparseD]; + maxLbest=0; + + bool dirR = iDir==0; + + // defaults: (from genomeParameters.txt) + // gSAsparseD = 1 + // gSAindexNbases = 14 + + for (uint iDist=0; iDist0) {//check the precense of the prefix for Lind + iSA1=mapGen.SAi[mapGen.genomeSAindexStart[Lind-1]+ind1]; // starting point for suffix array search. + if ((iSA1 & mapGen.SAiMarkAbsentMaskC) == 0) {//prefix exists + break; + } else {//this prefix does not exist, reduce Lind + --Lind; + ind1 = ind1 >> 2; + }; + }; + + // define lower bound for suffix array range search. 
+ if (mapGen.genomeSAindexStart[Lind-1]+ind1+1 < mapGen.genomeSAindexStart[Lind]) {//we are not at the end of the SA + iSA2=((mapGen.SAi[mapGen.genomeSAindexStart[Lind-1]+ind1+1] & mapGen.SAiMarkNmask) & mapGen.SAiMarkAbsentMask) - 1; + } else { + iSA2=mapGen.nSA-1; + }; + + + //#define SA_SEARCH_FULL + + #ifdef SA_SEARCH_FULL + //full search of the array even if the index search gave maxL + maxL=0; + Nrep = maxMappableLength(mapGen, Read1, pieceStart, pieceLength, iSA1 & mapGen.SAiMarkNmask, iSA2, dirR, maxL, indStartEnd); + #else + if (Lind < P.pGe.gSAindexNbases && (iSA1 & mapGen.SAiMarkNmaskC)==0 ) {//no need for SA search + // very short seq, already found hits in suffix array w/o having to search the genome for extensions. + indStartEnd[0]=iSA1; + indStartEnd[1]=iSA2; + Nrep=indStartEnd[1]-indStartEnd[0]+1; + maxL=Lind; + } else if (iSA1==iSA2) {//unique align already, just find maxL + if ((iSA1 & mapGen.SAiMarkNmaskC)!=0) { + ostringstream errOut; + errOut << "BUG: in ReadAlign::maxMappableLength2strands"; + exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_BUG, P); + }; + indStartEnd[0]=indStartEnd[1]=iSA1; + Nrep=1; + bool comparRes; + maxL=compareSeqToGenome(mapGen, Read1, pieceStart, pieceLength, Lind, iSA1, dirR, comparRes); + } else {//SA search, pieceLength>maxL + if ( (iSA1 & mapGen.SAiMarkNmaskC)==0 ) {//no N in the prefix + maxL=Lind; + } else { + maxL=0; + }; + Nrep = maxMappableLength(mapGen, Read1, pieceStart, pieceLength, iSA1 & mapGen.SAiMarkNmask, iSA2, dirR, maxL, indStartEnd); + }; + #endif + + if (maxL+iDist > maxLbest) {//this idist is better + maxLbest=maxL+iDist; + }; + NrepAll[iDist]=Nrep; + indStartEndAll[iDist][0]=indStartEnd[0]; + indStartEndAll[iDist][1]=indStartEnd[1]; + maxLall[iDist]=maxL; + }; + + for (uint iDist=0; iDist + +void ReadAlign::multMapSelect() {//select multiple mappers from all transcripts of all windows + + nTr=0; + if (nW==0) {//no good windows + return; + }; + //printf("heyyyy %llu\n", nW); + 
maxScore=-10*Lread; + for (uint iW=0; iWmaxScore) maxScore = trAll[iW][0]->maxScore; + }; + + if (maxScore!=trBest->maxScore) { + ostringstream errOut; + errOut << "BUG: maxScore!=trBest->maxScore in multMapSelect"; + exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_BUG, P); + }; + + for (uint iW=0; iWmaxScore + P.outFilterMultimapScoreRange) >= maxScore ) {//record this alignment + // if paired-end, record alignments from ALL windows + if (nTr==MAX_N_MULTMAP) {//too many alignments for this read, do not record it + ostringstream errOut; + errOut << "EXITING: Fatal ERROR: number of alignments exceeds MAX_N_MULTMAP, increase it and re-compile STAR"; + exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_PARAMETER, P); + }; + //printf("resetting trMult element yay %lld %lld\n", trAll[iW][iTr]->gStart, trAll[iW][iTr]->cStart); + trMult[nTr]=trAll[iW][iTr]; + trMult[nTr]->Chr = trAll[iW][0]->Chr; + trMult[nTr]->Str = trAll[iW][0]->Str; + trMult[nTr]->roStr = trAll[iW][0]->roStr; + + if ( (trAll[iW][iTr]->maxScore + P.outFilterMultimapScoreRange) >= maxScore) nTrMate++; + + nTr++; + }; + }; + }; + + if (nTr > P.outFilterMultimapNmax || nTr==0) + {//too multi OR no alignments, no need for further processing, since it will be considered unmapped + return; + }; + + //printf("made it here\n"); + + for (uint iTr=0; iTrroStart = trMult[iTr]->roStr==0 ? 
trMult[iTr]->rStart : Lread - trMult[iTr]->rStart - trMult[iTr]->rLength; + trMult[iTr]->cStart=trMult[iTr]->gStart - mapGen.chrStart[trMult[iTr]->Chr]; + //printf("new beginnings %llu %llu\n", trMult[iTr]->roStart, trMult[iTr]->cStart); + }; + +// if (P.outMultimapperOrder.sortCoord) +// {//sort multimappers by coordinate +// uint *s=new uint[nTr*2]; +// Transcript **t=new Transcript*[nTr]; +// for (uint itr=0; itrexons[0][EX_G]; +// s[itr*2+1]=itr; +// t[itr]=trMult[itr]; +// }; +// qsort((void*) s, nTr, sizeof(uint)*2, funCompareUint1); +// for (uint itr=0; itrprimaryFlag=true; + } else + {//multimappers + int nbest=0; + if (P.outMultimapperOrder.random || P.outSAMmultNmax != (uint) -1 ) + {//bring the best alignment to the top of the list. TODO sort alignments by the score? + for (uint itr=0; itrmaxScore == maxScore ) + { + swap(trMult[itr],trMult[nbest]); + ++nbest; + }; + }; + }; + if (P.outMultimapperOrder.random) + {//shuffle separately the best aligns, and the rest + for (int itr=nbest-1; itr>=1; itr--) + {//Fisher-Yates-Durstenfeld-Knuth shuffle + int rand1=int (rngUniformReal0to1(rngMultOrder)*itr+0.5); + swap(trMult[itr],trMult[rand1]); + }; + for (int itr=nTr-nbest-1; itr>=1; itr--) + {//Fisher-Yates-Durstenfeld-Knuth shuffle + int rand1=int (rngUniformReal0to1(rngMultOrder)*itr+0.5); + swap(trMult[nbest+itr],trMult[nbest+rand1]); + }; + }; + + if ( P.outSAMprimaryFlag=="AllBestScore" ) + { + for (uint itr=0; itrmaxScore == maxScore ) trMult[itr]->primaryFlag=true; + }; + } else if (P.outMultimapperOrder.random || P.outSAMmultNmax != (uint) -1) + { + trMult[0]->primaryFlag=true;//mark as primary the first one in the random ordered list: best scoring aligns are already in front of the list + // for (uint itr=0; itrmaxScore == maxScore ) trMult[itr]->primaryFlag=true; + // break; + // }; + } else + {//old way + trBest->primaryFlag=true; + }; + }; +}; + diff --git a/star-sys/STAR/source/ReadAlign_oneRead.cpp b/star-sys/STAR/source/ReadAlign_oneRead.cpp 
new file mode 100644 index 0000000..e6308a5 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_oneRead.cpp @@ -0,0 +1,89 @@ +#include "ReadAlign.h" +#include "readLoad.h" +#include "SequenceFuns.h" +#include "ErrorWarning.h" + +int ReadAlign::oneRead() {//process one read: load, map, write + //printf("at least called the method\n"); + //load read name, sequence, quality from the streams into internal arrays + int readStatus[2]; + + readStatus[0] = 0; + + std::istringstream is(readFastq[0]); + readStatus[0]=readLoad(is, P, 0, readLength[0], readLengthOriginal[0], readNameMates[0], Read0[0], Read1[0], Qual0[0], Qual1[0], clip3pNtotal[0], clip5pNtotal[0], clip3pAdapterN[0], iReadAll, readFilesIndex, readFilter, readNameExtra[0]); + if (readNmates==2) {//read the 2nd mate + std::istringstream is2(readFastq[1]); + + readStatus[1]=readLoad(is2, P, 1, readLength[1], readLengthOriginal[1], readNameMates[1], Read0[1], Read1[0]+readLength[0]+1, Qual0[1], Qual1[0]+readLength[0]+1, clip3pNtotal[1], clip5pNtotal[1], clip3pAdapterN[1], iReadAll, readFilesIndex, readFilter, readNameExtra[1]); + if (readStatus[0]!=readStatus[1]) { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: Read1 and Read2 are not consistent, reached the end of the one before the other one\n"; + errOut << "SOLUTION: Check you your input files: they may be corrupted\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + } else if (readStatus[0]==-1) {//finished with the stream + return -1; + }; + + //combine two reads together + Lread=readLength[0]+readLength[1]+1; + readLengthPairOriginal=readLengthOriginal[0]+readLengthOriginal[1]+1; + if (Lread>DEF_readSeqLengthMax) { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR in reads input: Lread of the pair = " << Lread << " while DEF_readSeqLengthMax=" << DEF_readSeqLengthMax <logMain, EXIT_CODE_INPUT_FILES, P); + }; + + Read1[0][readLength[0]]=MARK_FRAG_SPACER_BASE; //marker for spacer 
base + Qual1[0][readLength[0]]=0; + complementSeqNumbers(Read1[0]+readLength[0]+1,Read1[0]+readLength[0]+1,readLength[1]); //returns complement of Reads[ii] + for (uint ii=0;iigStart); + multMapSelect(); + mappedFilter(); + + #ifdef OFF_BEFORE_OUTPUT + #warning OFF_BEFORE_OUTPUT + return 0; + #endif + + //write out alignments + //outputAlignments(); + + return 0; + +}; diff --git a/star-sys/STAR/source/ReadAlign_outputAlignments.cpp b/star-sys/STAR/source/ReadAlign_outputAlignments.cpp new file mode 100755 index 0000000..c083de2 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_outputAlignments.cpp @@ -0,0 +1,165 @@ +#include "ReadAlign.h" +#include "GlobalVariables.h" +#include "ErrorWarning.h" + +const char* ReadAlign::outputAlignments() { + outBAMbytes=0; + + bool mateMapped[2]={false,false}; + + set readGeneFull={},readGene={}; + vector readTranscripts={}; + vector readGeneExon={}; + + std::stringstream stream; + + outFilterPassed=true;//only false if the alignment is held for outFilterBySJoutStage + if (unmapType==-1) {//output transcripts + if (P.outFilterBySJoutStage==1) {//filtering by SJout + for (uint iTr=0;iTrnExons-1;iex++) {//check all junctions + if (trMult[iTr]->canonSJ[iex]>=0 && trMult[iTr]->sjAnnot[iex]==0) { + outFilterPassed=false; + break; + }; + }; + if (!outFilterPassed) break; + }; + if (!outFilterPassed) {//this read is held for further filtering BySJout, record fastq + unmapType=-3; //the read is not conisddred unmapped + for (uint im=0;imN; + for (uint iTr=0;iTrChrChr>=mapGen.genomeInsertChrIndFirst) { + trMult[nTrOut]=trMult[itr]; + trMult[nTrOut]->primaryFlag=false; + ++nTrOut; + }; + }; + if (nTrOut==0) { + outSAMfilterYes=false; + } else { + trMult[0]->primaryFlag=true; + }; + }; + }; + if (nTr>1) {//multimappers + unmapType=-1; + } else if (nTr==1) {//unique mappers + unmapType=-2; + } else {//cannot be + ostringstream errOut; + errOut << "EXITING because of a BUG: nTr=0 in outputAlignments.cpp"; + exitWithError(errOut.str(), 
std::cerr, P.inOut->logMain, EXIT_CODE_BUG, P); + }; + + nTrOut=min(P.outSAMmultNmax,nTrOut); //number of to write to SAM/BAM files + //write to SAM/BAM + //printf("nTrOut %llu\n", nTrOut); + for (uint iTr=0;iTrexons[0][EX_iFrag]]=true; + mateMapped1[trMult[iTr]->exons[trMult[iTr]->nExons-1][EX_iFrag]]=true; + + if (P.outSAMbool && outSAMfilterYes) {//SAM output + //printf("samout\n"); + outBAMbytes+=outputTranscriptSAM(*(trMult[iTr]), nTr, iTr, (uint) -1, (uint) -1, 0, -1, NULL, &stream); + if (P.outSAMunmapped.keepPairs && readNmates>1 && ( !mateMapped1[0] || !mateMapped1[1] ) ) {//keep pairs && paired reads && one of the mates not mapped in this transcript + //printf("samout no null\n"); + outBAMbytes+= outputTranscriptSAM(*(trMult[iTr]), 0, 0, (uint) -1, (uint) -1, 0, 4, mateMapped1, &stream); + }; + }; + }; + + mateMapped[trBest->exons[0][EX_iFrag]]=true; + mateMapped[trBest->exons[trBest->nExons-1][EX_iFrag]]=true; + + if (readNmates>1 && !(mateMapped[0] && mateMapped[1]) ) { + unmapType=4; + }; + + + if (unmapType==4 && P.outSAMunmapped.yes) {//output unmapped end for single-end alignments + if (P.outSAMbool && !P.outSAMunmapped.keepPairs && outSAMfilterYes) { + outBAMbytes+= outputTranscriptSAM(*trBest, 0, 0, (uint) -1, (uint) -1, 0, unmapType, mateMapped, &stream); + }; + }; + + /* + if (P.outSJfilterReads=="All" || nTr==1) { + chunkOutSJ=new OutSJ (P.limitOutSJcollapsed, P, mapGen); + uint sjReadStartN=chunkOutSJ->N; + for (uint iTr=0;iTr=0 && unmapType<4 ) {//output unmapped within && unmapped read && both mates unmapped + if (P.outSAMbool) {//output SAM + outBAMbytes+= outputTranscriptSAM(*trBest, 0, 0, (uint) -1, (uint) -1, 0, unmapType, mateMapped, &stream); + //printf("how about here?\n"); + }; + }; + + std::stringbuf * pbuf = stream.rdbuf(); + std::streamsize size = pbuf->pubseekoff(0,stream.end); + pbuf->pubseekoff(0,stream.beg); + char *contents = (char*)malloc(size+1); + pbuf->sgetn (contents,size); + contents[size] = '\0'; + + return contents; 
+}; + + + diff --git a/star-sys/STAR/source/ReadAlign_outputTranscriptCIGARp.cpp b/star-sys/STAR/source/ReadAlign_outputTranscriptCIGARp.cpp new file mode 100644 index 0000000..f884d1d --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_outputTranscriptCIGARp.cpp @@ -0,0 +1,65 @@ +#include "ReadAlign.h" +#include "SequenceFuns.h" + +string ReadAlign::outputTranscriptCIGARp(Transcript const &trOut) {//generates CIGARp string for the transcript + //p is a special CIGAR operation to encode gap between mates. This gap is negative for overlapping mates + + string CIGAR; + samStreamCIGAR.str(std::string()); + + uint leftMate=0; + if (P.readFilesIn.size()>1) leftMate=trOut.Str; + + uint trimL=trOut.exons[0][EX_R] - (trOut.exons[0][EX_R]0) { + samStreamCIGAR << trimL << "S"; //initial trimming + }; + + for (uint ii=0;ii0) {//record gaps + uint gapG=trOut.exons[ii][EX_G]-(trOut.exons[ii-1][EX_G]+trOut.exons[ii-1][EX_L]); + + if (trOut.exons[ii][EX_G] >= (trOut.exons[ii-1][EX_G]+trOut.exons[ii-1][EX_L]) ) {// + if (trOut.canonSJ[ii-1]==-3) {//gap between mates + //soft clipping of the second mate + uint s1=readLengthOriginal[leftMate]-(trOut.exons[ii-1][EX_R]+trOut.exons[ii-1][EX_L]); + uint s2=trOut.exons[ii][EX_R]-(readLengthOriginal[leftMate]+1); + if (s1>0){ + samStreamCIGAR << s1 << "S"; + }; + samStreamCIGAR << gapG << "p"; + if (s2>0){ + samStreamCIGAR << s2 << "S"; + }; + + } else { + //it's possible to have a D or N and I for at the same time + uint gapR=trOut.exons[ii][EX_R]-trOut.exons[ii-1][EX_R]-trOut.exons[ii-1][EX_L]; //gapR>0 always + if (gapR>0){ + samStreamCIGAR << gapR << "I"; + }; + if (trOut.canonSJ[ii-1]>=0 || trOut.sjAnnot[ii-1]==1) {//junction: N + samStreamCIGAR << gapG << "N"; + } else if (gapG>0) {//deletion + samStreamCIGAR << gapG << "D"; + }; + }; + } else {//mates overlap + samStreamCIGAR << "-" << (trOut.exons[ii-1][EX_G]+trOut.exons[ii-1][EX_L]) - trOut.exons[ii][EX_G] << "p"; + }; + }; + samStreamCIGAR << trOut.exons[ii][EX_L] << "M"; + }; + 
+ + trimL=(trOut.exons[trOut.nExons-1][EX_R] 0 ) { + samStreamCIGAR << trimL << "S"; //final trimming + }; + CIGAR=samStreamCIGAR.str(); + + return CIGAR; + + + +}; diff --git a/star-sys/STAR/source/ReadAlign_outputTranscriptSAM.cpp b/star-sys/STAR/source/ReadAlign_outputTranscriptSAM.cpp new file mode 100644 index 0000000..2ddd13c --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_outputTranscriptSAM.cpp @@ -0,0 +1,343 @@ +#include "ReadAlign.h" +#include "SequenceFuns.h" +#include "ErrorWarning.h" + +uint ReadAlign::outputTranscriptSAM(Transcript const &trOut, uint nTrOut, uint iTrOut, uint mateChr, uint mateStart, char mateStrand, int unmapType, bool *mateMapped, ostream *outStream) { + + if (P.outSAMmode=="None") return 0; //no SAM output + + uint outStreamPos0=(uint)outStream->tellp(); + + if (unmapType>=0) + {//unmapped reads: SAM + //printf("not a map %llu\n", readNmates); + for (uint imate=0;imatetellp()-outStreamPos0; + };//if (unmapType>=0 && outStream != NULL) //unmapped reads: SAM + + + bool flagPaired = readNmates==2; + string CIGAR; + + //for SAM output need to split mates + uint iExMate; //last exon of the first mate + uint nMates=1; + for (iExMate=0;iExMatemapGen.nChrReal) samFlagCommon+=0x0008; //not mapped as pair + } else + {//paired align + if (P.alignEndsProtrude.concordantPair || \ + ( (trOut.exons[0][EX_G] <= trOut.exons[iExMate+1][EX_G]+trOut.exons[0][EX_R]) && \ + (trOut.exons[iExMate][EX_G]+trOut.exons[iExMate][EX_L] <= trOut.exons[trOut.nExons-1][EX_G]+Lread-trOut.exons[trOut.nExons-1][EX_R]) ) ) + {//properly paired + samFlagCommon+=0x0002; + }; + }; + } else + {//single end + samFlagCommon=0; + }; + + if (readFilter=='Y') samFlagCommon+=0x200; //not passing quality control + + uint Str= trOut.Str;//note that Strand = the mate on the left + uint leftMate=0; //the mate (0 or 1) which is on the left + if (flagPaired) { + leftMate=Str; + }; + + if (P.outSAMattrPresent.MC) { + calcCIGAR(trOut, nMates, iExMate, leftMate); + }; + + uint 
samFLAG; + + for (uint imate=0;imate0) { + samStreamCIGAR << trimL1 << "S"; //initial trimming + }; + + for (uint ii=iEx1;ii<=iEx2;ii++) { + if (ii>iEx1) {//record gaps + uint gapG=trOut.exons[ii][EX_G]-(trOut.exons[ii-1][EX_G]+trOut.exons[ii-1][EX_L]); + uint gapR=trOut.exons[ii][EX_R]-trOut.exons[ii-1][EX_R]-trOut.exons[ii-1][EX_L]; + //it's possible to have a D or N and I at the same time + if (gapR>0){ + samStreamCIGAR << gapR; + samStreamCIGAR << "I"; + }; + if (trOut.canonSJ[ii-1]>=0 || trOut.sjAnnot[ii-1]==1) {//junction: N + samStreamCIGAR << gapG; + samStreamCIGAR << "N"; + samStreamSJmotif <<','<< trOut.canonSJ[ii-1] + (trOut.sjAnnot[ii-1]==0 ? 0 : SJ_SAM_AnnotatedMotifShift); //record junction type +// samStreamSJannot <<','<< (int) trOut.sjAnnot[ii-1]; //record annotation type + samStreamSJintron <<','<< trOut.exons[ii-1][EX_G] + trOut.exons[ii-1][EX_L] + 1 - mapGen.chrStart[trOut.Chr] <<','\ + << trOut.exons[ii][EX_G] - mapGen.chrStart[trOut.Chr]; //record intron loci + } else if (gapG>0) {//deletion: N + samStreamCIGAR << gapG; + samStreamCIGAR << "D"; + }; + }; + samStreamCIGAR << trOut.exons[ii][EX_L] << "M"; + }; + + string SJmotif = samStreamSJmotif.str(); + string SJintron = samStreamSJintron.str(); +// string SJannot = samStreamSJannot.str(); + + if (SJmotif.length()==0) {//no junctions recorded, mark with -1 + SJmotif=",-1"; + SJintron=",-1"; +// SJannot=",-1"; + }; + + //printf("weird trimming time %llu %llu %llu %llu %llu %llu\n", trOut.exons[iEx1][EX_R], readLength[leftMate], readLengthOriginal[Mate], trOut.exons[iEx2][EX_R], trOut.exons[iEx2][EX_L], trimL); + uint trimR1=(trOut.exons[iEx1][EX_R] 0 ) { + samStreamCIGAR << trimR1 << "S"; //final trimming + }; + CIGAR=samStreamCIGAR.str(); + + + char seqMate[DEF_readSeqLengthMax+1], qualMate[DEF_readSeqLengthMax+1]; + char *seqOut=NULL, *qualOut=NULL; + + if ( Mate==Str ) {//seq strand is correct + seqOut=Read0[Mate]; + qualOut=Qual0[Mate]; + } else { + revComplementNucleotides(Read0[Mate], 
seqMate, readLengthOriginal[Mate]); + seqMate[readLengthOriginal[Mate]]=0; + for (uint ii=0;ii=5) { + MAPQ=0; + } else if (nTrOut>=3) { + MAPQ=1; + } else if (nTrOut==2) { + MAPQ=3; + }; + + *outStream << readName+1 <<"\t"<< ((samFLAG & P.outSAMflagAND) | P.outSAMflagOR) <<"\t"<< mapGen.chrName[trOut.Chr] <<"\t"<< trOut.exons[iEx1][EX_G] + 1 - mapGen.chrStart[trOut.Chr] + <<"\t"<< MAPQ <<"\t"<< CIGAR; + + if (nMates>1) { + *outStream <<"\t"<< "=" <<"\t"<< trOut.exons[(imate==0 ? iExMate+1 : 0)][EX_G]+ 1 - mapGen.chrStart[trOut.Chr] + <<"\t"<< (imate==0? "":"-") << trOut.exons[trOut.nExons-1][EX_G]+trOut.exons[trOut.nExons-1][EX_L]-trOut.exons[0][EX_G]; + } else if (mateChr customAttr(outSAMattrN,""); + + uint tagNM=0; + string tagMD(""); + if (P.outSAMattrPresent.NM || P.outSAMattrPresent.MD) { + char* R=Read1[trOut.roStr==0 ? 0:2]; + uint matchN=0; + for (uint iex=iEx1;iex<=iEx2;iex++) { + for (uint ii=0;ii0 || (ii==0 && iex>0 && trOut.canonSJ[iex]==-1) ) { + tagMD+=to_string(matchN); +// }; + tagMD+=P.genomeNumToNT[(uint8) g1]; + matchN=0; + } else { + matchN++; + }; + }; + if (iex1) { + *outStream<< "\tMC:Z:" <logMain, EXIT_CODE_PARAMETER, P); + }; + }; + + if (P.readFilesTypeN==10 && !readNameExtra[imate].empty()) {//SAM files as input - output extra attributes + *outStream << "\t" << readNameExtra.at(imate); + }; + + *outStream << "\n"; //done with one SAM line + };//for (uint imate=0;imatetellp()-outStreamPos0; +}; diff --git a/star-sys/STAR/source/ReadAlign_outputTranscriptSJ.cpp b/star-sys/STAR/source/ReadAlign_outputTranscriptSJ.cpp new file mode 100644 index 0000000..05179e8 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_outputTranscriptSJ.cpp @@ -0,0 +1,54 @@ +#include "ReadAlign.h" +#include "OutSJ.h" + +void ReadAlign::outputTranscriptSJ(Transcript const &trOut, uint nTrOut, OutSJ *chunkOutSJ, uint sjReadStartN ) {//record junctions in chunkOutSJ array + + //TODO: make sure that a junction is recorded onyl once from one read. 
+ //For a multimapper, several alignments may contain the same junctions - now it's recorded several time. +// if (nTrOut>1) return; //junctions from multi-mappers are not recorded + +// if (P.outSAMmode=="None") return; //no SAM output + + for (uint iex=0;iex=0) {//only record junctions, not indels or mate gap + chunkOutSJ->oneSJ.junctionPointer(chunkOutSJ->data, chunkOutSJ->N);//get pointer to an empty junction in the data array + *chunkOutSJ->oneSJ.start=trOut.exons[iex][EX_G]+trOut.exons[iex][EX_L]; //start of the intron + *chunkOutSJ->oneSJ.gap=trOut.exons[iex+1][EX_G]-*chunkOutSJ->oneSJ.start; + //overhangs: basic method + //*chunkOutSJ->oneSJ.overhangLeft = (uint32) trOut.exons[iex][EX_L];//TODO calculate the lengh of overhangs taking into account indels + //*chunkOutSJ->oneSJ.overhangRight = (uint32) trOut.exons[iex+1][EX_L]; + //overhangs: min method + *chunkOutSJ->oneSJ.overhangLeft = min ( (uint32) trOut.exons[iex][EX_L],(uint32) trOut.exons[iex+1][EX_L] ); + *chunkOutSJ->oneSJ.overhangRight = *chunkOutSJ->oneSJ.overhangLeft; + + //check if this junction has been recorded from this read - this happens when the mates overlap and cross the same junctions + bool duplicateSJ(false); + for (uint ii=sjReadStartN; iiN; ii++) {//TODO if there are many junctions, need to make more efficient + if ( *chunkOutSJ->oneSJ.start == *((uint*) (chunkOutSJ->data+ii*Junction::dataSize+Junction::startP)) \ + && *chunkOutSJ->oneSJ.gap == *((uint32*) (chunkOutSJ->data+ii*Junction::dataSize+Junction::gapP)) ) { + duplicateSJ=true; + uint16* overhang1=(uint16*) (chunkOutSJ->data+ii*Junction::dataSize+Junction::overhangLeftP); + if (*overhang1<*chunkOutSJ->oneSJ.overhangLeft) { + *overhang1=*chunkOutSJ->oneSJ.overhangLeft; + * ((uint16*) (chunkOutSJ->data+ii*Junction::dataSize+Junction::overhangRightP))=*overhang1; + }; + break; + }; + }; + if (duplicateSJ) continue; //do not record this junctions + + *chunkOutSJ->oneSJ.motif=trOut.canonSJ[iex]; + *chunkOutSJ->oneSJ.strand=(char) 
(trOut.canonSJ[iex]==0 ? 0 : (trOut.canonSJ[iex]+1)%2+1); + *chunkOutSJ->oneSJ.annot=trOut.sjAnnot[iex]; + if (nTrOut==1) { + *chunkOutSJ->oneSJ.countUnique=1; + *chunkOutSJ->oneSJ.countMultiple=0; + } else { + *chunkOutSJ->oneSJ.countMultiple=1; //TODO: 1/nTrOut? + *chunkOutSJ->oneSJ.countUnique=0; //TODO: 1/nTrOut? + }; + + chunkOutSJ->N++;//increment the number of recorded junctions + }; + }; +}; diff --git a/star-sys/STAR/source/ReadAlign_outputVariation.cpp b/star-sys/STAR/source/ReadAlign_outputVariation.cpp new file mode 100644 index 0000000..3e516f9 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_outputVariation.cpp @@ -0,0 +1,13 @@ +#include "ReadAlign.h" + +void ReadAlign::outputVariation(Variation &Var, Transcript Tr, uint iTr, uint nTr) +{ + if (!Var.yes) + { + return; + }; + + + + +}; diff --git a/star-sys/STAR/source/ReadAlign_peOverlapMergeMap.cpp b/star-sys/STAR/source/ReadAlign_peOverlapMergeMap.cpp new file mode 100644 index 0000000..5dbe410 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_peOverlapMergeMap.cpp @@ -0,0 +1,297 @@ +#include "ReadAlign.h" +#include "SequenceFuns.h" + +void ReadAlign::peOverlapMergeMap() { + + if (!P.peOverlap.yes || P.readNmates!=2 ) {//no peOverlap + peOv.yes=false; + return; + }; + + //debug + //cout << ">" << readName+1; + + + //merge PE mates into SE + peMergeRA->copyRead(*this); + peMergeRA->peMergeMates(); + peOv=peMergeRA->peOv; + peOv.yes=false; + + if (peOv.nOv==0) {//check if mates can be merged, if not - return + //cout <<"\n-1\n"; + return; + }; + + //change parameters for SE mapping + //double P_alignSplicedMateMapLminOverLmate=P.alignSplicedMateMapLminOverLmate; + //P.alignSplicedMateMapLminOverLmate=P.alignSplicedMateMapLminOverLmate*peMergeRA->readLength[0]/(readLength[0]+readLength[1]); + + //map SE + peMergeRA->mapOneRead(); + if (peMergeRA->nW==0) { // || peMergeRA->trBest->maxScore+peOv.nOv < trBest->maxScore) {//no windows, score of the merged align is less. 
This is a preliminary check, more accurate check is done with alignment score calculated after transforming the SE back to PE + //cout <<" -2\n"; + //for (uint ii=0;iiLread;ii++) { + // cout <Read1[0][ii]]; + //}; + //cout << "\n"; + return; + }; + + //convert best alignment SE to PE + //trA=*trInit; + //trA.peOverlapSEtoPE(peOv.nOv, *peMergeRA->trBest); + //trA.alignScore(Read1,mapGen.G,P); + //if (trA.maxScoremaxScore || trA.nMM > outFilterMismatchNmaxTotal) {//merged-mate SE alignment has lower score than the PE + // return; + //}; + + intScore peScore=trBest->maxScore; + + //convert SE to PE *this ReadAlign + peMergeRA->peOv=peOv; + peOverlapSEtoPE(*peMergeRA); + + //debug + //if (oldScore>trBest->maxScore || trBest->maxScoretrBest->maxScore) + // cout << readName << " "<< oldScore << " "<< peMergeRA->trBest->maxScore << " "<maxScore << endl; + + + //chimeric detection for SE + chimericDetectionPEmerged(*peMergeRA); + + //debug + //cout << "\n"; + //for (uint ii=0;iiLread;ii++) { + // cout <Read1[0][ii]]; + //}; + //cout << "\n"; + + //P.alignSplicedMateMapLminOverLmate=P_alignSplicedMateMapLminOverLmate; + + if (peScore<=trBest->maxScore || chimRecord) {//otherwise peOv.yes=false + peOv.yes=true; + }; + + return; +}; + +void ReadAlign::peMergeMates() { + + uint s1=localSearchNisMM(Read1[0],readLength[0],Read1[0]+readLength[0]+1,readLength[1],P.peOverlap.MMp); + uint s0=localSearchNisMM(Read1[0]+readLength[0]+1,readLength[1],Read1[0],readLength[0],P.peOverlap.MMp); + + uint o1=min(readLength[1],readLength[0]-s1); + uint o0=min(readLength[0],readLength[1]-s0); + + peOv.nOv=max(o0,o1); + + if (peOv.nOv=o0) { + peOv.mateStart[0]=0; + peOv.mateStart[1]=s1; + if (o1= mEnd[0] || t.exons[iex][EX_R]+t.exons[iex][EX_L] < mSta[0]) {//this exon is only in mate2, break this cycle +// break; +// }; +// //record these exons for mate1 +// +// exons[iex][EX_iFrag]=t.Str; +// exons[iex][EX_sjA]=t.exons[iex][EX_sjA]; +// canonSJ[iex]=t.canonSJ[iex]; +// 
sjAnnot[iex]=t.sjAnnot[iex]; +// sjStr[iex]=t.sjStr[iex]; +// shiftSJ[iex][0]=t.shiftSJ[iex][0]; +// shiftSJ[iex][1]=t.shiftSJ[iex][1]; +// +// exons[iex][EX_R]=t.exons[iex][EX_R]-mSta[0]; +// exons[iex][EX_G]=t.exons[iex][EX_G]; +// if (t.exons[iex][EX_R]+t.exons[iex][EX_L] < mEnd[0]) {//exon is fully in mate1 +// exons[iex][EX_L]=t.exons[iex][EX_L]; +// } else { +// exons[iex][EX_L]=mEnd[0]-t.exons[iex][EX_R]; +// }; +// }; + + nExons=0; + for (uint imate=0; imate<2; imate++) {//cycle over mate 1,2 + for (uint iex=0; iex= mEnd[imate] || t.exons[iex][EX_R]+t.exons[iex][EX_L] <= mSta[imate]) {//this exon is only in mate2, do not record here + continue; + }; + + exons[nExons][EX_iFrag]=(imate==0 ? t.Str : 1-t.Str); + exons[nExons][EX_sjA]=t.exons[iex][EX_sjA]; + if (iex=mSta[imate]) {//exon left is inside the mate + exons[nExons][EX_G]=t.exons[iex][EX_G]; + exons[nExons][EX_L]=t.exons[iex][EX_L]; + exons[nExons][EX_R]=t.exons[iex][EX_R]-mSta[imate]+mSta2[imate]; + } else {//need to split the exon + exons[nExons][EX_R]=mSta2[imate];//exon starts at the mate start + uint delta=mSta[imate]-t.exons[iex][EX_R]; //shorten exon by this length + exons[nExons][EX_L]=t.exons[iex][EX_L]-delta; + exons[nExons][EX_G]=t.exons[iex][EX_G]+delta; + }; + + if (t.exons[iex][EX_R]+t.exons[iex][EX_L] > mEnd[imate]) {//exon right is to the left of the mate end, shorten the exon + exons[nExons][EX_L]-=t.exons[iex][EX_R]+t.exons[iex][EX_L]-mEnd[imate]; + }; + + ++nExons; + }; + canonSJ[nExons-1]=-3; //marks "junction" between mates + sjAnnot[nExons-1]=0; + sjStr[nExons-1]=0; + shiftSJ[nExons-1][0]=0; + shiftSJ[nExons-1][1]=0; + }; + + //copy scalar variables + for (uint ii=0;ii<3;ii++) { + intronMotifs[ii]=t.intronMotifs[ii]; + }; + sjMotifStrand=t.sjMotifStrand; + //iFrag; //do not need it + Chr=t.Chr; + Str=t.Str; + roStr=t.roStr; + gStart=t.gStart; + gLength=t.gLength; + cStart=t.cStart; + + rLength=0; + for (uint iex=0;iexpeOverlapSEtoPE(peOv.mateStart, *seRA.trAll[iW][iTr]); + 
trAll[iW][iTr]->alignScore(Read1,mapGen.G,P); + if (trAll[iW][iTr]->maxScore > trAll[iW][0]->maxScore) { + swap(trAll[iW][iTr],trAll[iW][0]); + }; + }; + if (trAll[iW][0]->maxScore>bestScore) { + trBest=trAll[iW][0]; + bestScore=trBest->maxScore; + }; + }; + + return; +}; diff --git a/star-sys/STAR/source/ReadAlign_quantTranscriptome.cpp b/star-sys/STAR/source/ReadAlign_quantTranscriptome.cpp new file mode 100644 index 0000000..ef007f7 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_quantTranscriptome.cpp @@ -0,0 +1,72 @@ +#include "Transcriptome.h" +#include "ReadAlign.h" +#include "Transcript.h" +#include "serviceFuns.cpp" +#include + +uint ReadAlign::quantTranscriptome (Transcriptome *Tr, uint nAlignG, Transcript **alignG, Transcript *alignT, vector &readTranscripts, set &readTrGenes) { + uint nAlignT=0; + for (uint iag=0; iagnDel>0 || alignG[iag]->nIns>0) ) {//prevent indels if requested + continue; + }; + if (!P.quant.trSAM.singleEnd && (P.readNmates==2 && alignG[iag]->exons[0][EX_iFrag]==alignG[iag]->exons[alignG[iag]->nExons-1][EX_iFrag]) ) + {//prevent single end alignments + continue; + }; + + uint nMM1=0; + char* R=Read1[alignG[iag]->roStr==0 ? 0:2]; + if (!P.quant.trSAM.softClip) {//soft clipping not allowed, extend them if possible + for (uint32 iab=0; iabnExons; iab++) { + uint left1=0,right1=0;//how many bases to move left or right + if (iab==0) { + left1=alignG[iag]->exons[iab][EX_R]; + } else if (alignG[iag]->canonSJ[iab-1]==-3) { + left1=alignG[iag]->exons[iab][EX_R]-readLength[alignG[iag]->exons[iab-1][EX_iFrag]]-1; + }; + if (iab==alignG[iag]->nExons-1) {//last block of left mates + right1=Lread-alignG[iag]->exons[iab][EX_R]-alignG[iag]->exons[iab][EX_L]; + + } else if (alignG[iag]->canonSJ[iab]==-3) {//last block of the right mate (i.e. 
whole read) + right1=readLength[alignG[iag]->exons[iab][EX_iFrag]]-alignG[iag]->exons[iab][EX_R]-alignG[iag]->exons[iab][EX_L]; + }; + + for (uint b=1; b<=left1 ; b++) {//extend to the left + char r1=R[alignG[iag]->exons[iab][EX_R]-b]; + char g1=mapGen.G[alignG[iag]->exons[iab][EX_G]-b]; + if ( r1!=g1 && r1<4 && g1<4) ++nMM1; + }; + for (uint b=0; bexons[iab][EX_R]+alignG[iag]->exons[iab][EX_L]+b]; + char g1=mapGen.G[alignG[iag]->exons[iab][EX_G]+alignG[iag]->exons[iab][EX_L]+b]; + if ( r1!=g1 && r1<4 && g1<4) ++nMM1; + }; + alignG[iag]->exons[iab][EX_R] -= left1; + alignG[iag]->exons[iab][EX_G] -= left1; + alignG[iag]->exons[iab][EX_L] += left1+right1; + }; + + if ( (alignG[iag]->nMM + nMM1) > min(outFilterMismatchNmaxTotal, (uint) (P.outFilterMismatchNoverLmax*(Lread-1)) ) ) { + //extension of soft clips yielded too many mismatches, no output + continue; + }; + }; + + nAlignT += Tr->quantAlign(*alignG[iag],alignT+nAlignT, readTranscripts, readTrGenes); + }; + + //not used anymore, at Colin Dewey's request + // if (nAlignT==0 && P.outSAMunmapped=="Within") {//read could be mapped to genome, but not transcriptome - output as unmapped + // uint unmapType=5; + // bool mateMapped[2]={false,false}; + // alignBAM(*alignG[0], 0, 0, mapGen.chrStart[alignG[0]->Chr], (uint) -1, (uint) -1, 0, unmapType, mateMapped, P.outSAMattrOrder); + // for (uint imate=0; imateunsortedOneAlign(outBAMoneAlign[imate], outBAMoneAlignNbytes[imate], imate>0 ? 
0 : outBAMoneAlignNbytes[0]+outBAMoneAlignNbytes[1]); + // }; + // + // }; + + return nAlignT; +}; diff --git a/star-sys/STAR/source/ReadAlign_stitchPieces.cpp b/star-sys/STAR/source/ReadAlign_stitchPieces.cpp new file mode 100644 index 0000000..e577d9f --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_stitchPieces.cpp @@ -0,0 +1,354 @@ +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "ReadAlign.h" +#include "SequenceFuns.h" +#include "stitchWindowAligns.h" +#include "sjAlignSplit.cpp" +#include "PackedArray.h" +#include "alignSmithWaterman.h" +#include "GlobalVariables.h" +#include + +void ReadAlign::stitchPieces(char **R, uint Lread) { + + //zero-out winBin + memset(winBin[0],255,sizeof(winBin[0][0])*P.winBinN); + memset(winBin[1],255,sizeof(winBin[0][0])*P.winBinN); + +// for (uint iWin=0;iWin=readLength[PC[iP][PC_iFrag]] ) {//proceed if piece is an anchor, i.e. maps few times or is long enough + if (PC[iP][PC_Nrep]<=P.winAnchorMultimapNmax ) {//proceed if piece is an anchor, i.e. maps few times + + uint aDir = PC[iP][PC_Dir]; + uint aLength= PC[iP][PC_Length]; + + for (uint iSA=PC[iP][PC_SAstart]; iSA<=PC[iP][PC_SAend]; iSA++) {//scan through all alignments of this piece + // going through ordered positions in the suffix array from PC_SAstart to PC_SAend + uint a1 = mapGen.SA[iSA]; + //printf("a1 %llu\n", a1); + uint aStr = a1 >> mapGen.GstrandBit; + a1 &= mapGen.GstrandMask; //remove strand bit + + //convert to positive strand + if (aDir==1 && aStr==0) { + aStr=1; + } else if (aDir==0 && aStr==1) { + a1 = mapGen.nGenome - (aLength+a1); + } else if (aDir==1 && aStr==1) { + aStr=0; + a1 = mapGen.nGenome - (aLength+a1); + }; + //final strand + if (revertStrand) { //modified strand according to user input CHECK!!!! 
+ aStr=1-aStr; + }; + + + if (a1>=mapGen.sjGstart) {//this is sj align + uint a1D, aLengthD, a1A, aLengthA, sj1; + if (sjAlignSplit(a1, aLength, mapGen, a1D, aLengthD, a1A, aLengthA, sj1)) {//align crosses the junction + + int addStatus=createExtendWindowsWithAlign(a1D, aStr);//add donor piece + if (addStatus==EXIT_createExtendWindowsWithAlign_TOO_MANY_WINDOWS) {//too many windows + break; + }; + addStatus=createExtendWindowsWithAlign(a1A, aStr);//add acceptor piece + if (addStatus==EXIT_createExtendWindowsWithAlign_TOO_MANY_WINDOWS) {//too many windows + break; + }; + }; + } else {//this is a normal genomic read + int addStatus=createExtendWindowsWithAlign(a1, aStr); + if (addStatus==EXIT_createExtendWindowsWithAlign_TOO_MANY_WINDOWS) {//too many windows + break; + }; + }; + }; //for (uint iSA=PC[iP][PC_SAstart]; iSA<=PC[iP][PC_SAend]; iSA++) //scan through all alignments of this piece + };//if (PC[iP][PC_Nrep]<=P.winAnchorMultimapNmax) //proceed if anchor + };//for (uint iP=0; iP0 && mapGen.chrBin[(wb-1) >> P.winBinChrNbits]==WC[iWin][WC_Chr];ii++) { + wb--; + winBin[ WC[iWin][WC_Str] ][ wb ]=(uintWinBin) iWin; + }; + WC[iWin][WC_gStart] = wb; + + wb=WC[iWin][WC_gEnd]; + for (uint ii=0; ii> P.winBinChrNbits]==WC[iWin][WC_Chr];ii++) { + wb++; + winBin[ WC[iWin][WC_Str] ][ wb ]=(uintWinBin) iWin; + }; + WC[iWin][WC_gEnd] = wb; + + + }; + nWA[iWin]=0; //initialize nWA + WALrec[iWin]=0; //initialize rec-length + WlastAnchor[iWin]=-1; + }; + + nWall=nW; + + #ifdef OFF_BEFORE_SEEDdistribution + #warning OFF_BEFORE_SEEDdistribution + nW=0; + nTr=0; + return; + #endif + + for (uint iP=0; iP> mapGen.GstrandBit; + a1 &= mapGen.GstrandMask; //remove strand bit + uint aRstart=PC[iP][PC_rStart]; + + //convert to positive strand + if (aDir==1 && aStr==0) { + aStr=1; + aRstart = Lread - (aLength+aRstart); + } else if (aDir==0 && aStr==1) { + aRstart = Lread - (aLength+aRstart); + a1 = mapGen.nGenome - (aLength+a1); + } else if (aDir==1 && aStr==1) { + aStr=0; + a1 = 
mapGen.nGenome - (aLength+a1); + }; + + //final strand + if (revertStrand) { //modified strand according to user input CHECK!!!! + aStr=1-aStr; + }; + + + if (a1>=mapGen.sjGstart) {//this is sj read + uint a1D, aLengthD, a1A, aLengthA, isj1; + if (sjAlignSplit(a1, aLength, mapGen, a1D, aLengthD, a1A, aLengthA, isj1)) {//align crosses the junction + + assignAlignToWindow(a1D, aLengthD, aStr, aNrep, aFrag, aRstart, aAnchor, isj1); + assignAlignToWindow(a1A, aLengthA, aStr, aNrep, aFrag, aRstart+aLengthD, aAnchor, isj1); + + } else {//align does not cross the junction + continue; //do not check this align, continue to the next one + }; + + } else {//this is a normal genomic read + assignAlignToWindow(a1, aLength, aStr, aNrep, aFrag, aRstart, aAnchor, -1); + }; + }; + + +// for (uint ii=0;iiP.seedNoneLociPerWindow) nWA[ii] -= nWAP[ii]; +// }; + }; + + //TODO remove windows that have too many alignments + //aligns are still sorted by original read coordinates, change direction for negative strand + // DOES NOT HELP!!! +// for ( uint iW=0;iWWA[iW][nWA[iW]-1][WA_rStart]) {//swap +// for (uint iA=0;iA0) { + //select good windows by coverage + uint rLast=0; + + for (uint ia=0; iarLast+1) { + if (r1>rLast) { + swWinCov[iW] += L1; + } else { + swWinCov[iW] += r1+L1-(rLast+1); + }; + rLast=r1+L1-1; + }; + };//for (uint ia=0; iaswWinCovMax) swWinCovMax=swWinCov[iW]; + };//if (nWA[iW]>0) +};//for (uint iW=0;iW= P.alignTranscriptsPerReadNmax) { + P.inOut->logMain << "WARNING: not enough space allocated for transcript. Did not process all windows for read "<< readName+1 <logMain <<" SOLUTION: increase alignTranscriptsPerReadNmax and re-run\n" << flush; + break; + }; + //printf("trA %llu\n", trA.Chr); + *(trAll[iW1][0])=trA; + nWinTr[iW1]=0; //initialize number of transcripts per window + + + #ifdef COMPILE_FOR_LONG_READS + stitchWindowSeeds(iW, iW1, NULL, R[trA.roStr==0 ? 
0:2]); + if (P.pCh.segmentMin>0) { + for (uint ia=0;ia trA.exons[iex][EX_R] && \ + WA[iW][ia][WA_gStart] < (trA.exons[iex][EX_G]+trA.exons[iex][EX_L]) && \ + (WA[iW][ia][WA_gStart]+WA[iW][ia][WA_Length]) > trA.exons[iex][EX_G] ) + { + WAincl[ia]=true; + break; + }; + + }; + }; + stitchWindowSeeds(iW, iW1, WAincl, R[trA.roStr==0 ? 0:2]); + }; + #else + stitchWindowAligns(0, nWA[iW], 0, WAincl, 0, 0, trA, Lread, WA[iW], R[trA.roStr==0 ? 0:2], mapGen, P, trAll[iW1], nWinTr+iW1, this); + #endif + if (nWinTr[iW1]==0) { + continue; + }; + + if (trAll[iW1][0]->maxScore > trBest->maxScore || (trAll[iW1][0]->maxScore == trBest->maxScore && trAll[iW1][0]->gLength < trBest->gLength ) ) { + trBest=trAll[iW1][0]; + }; + //printf("why not add here %llu\n", nWinTr[iW1]); + trNtotal += nWinTr[iW1]; + iW1++; + }; + + nW=iW1;//only count windows that had alignments + +// {//debug +// std::time(&timeFinish); +// double timeDiff=difftime(timeFinish,timeStart); +// cout << " "<< timeDiff << " "<maxScore*100/Lread<<" "<maxScore==0) {//no window was aligned (could happen if for all windows too many reads are multiples) + mapMarker = MARKER_NO_GOOD_WINDOW; + nW=0; + return; + }; + +};//end of function diff --git a/star-sys/STAR/source/ReadAlign_stitchWindowSeeds.cpp b/star-sys/STAR/source/ReadAlign_stitchWindowSeeds.cpp new file mode 100755 index 0000000..ddd3392 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_stitchWindowSeeds.cpp @@ -0,0 +1,278 @@ +#include + +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "ReadAlign.h" +#include "stitchAlignToTranscript.h" +#include "extendAlign.h" +#include "binarySearch2.h" +#include "ErrorWarning.h" + +void ReadAlign::stitchWindowSeeds (uint iW, uint iWrec, bool *WAexcl, char *R) {//stitches all seeds in one window: iW + + for (uint iS1=0;iS11) + {//junctions have to be present in the filtered set P.sjnovel + uint iex=0; + if (trA1.canonSJ[iex]>=0 && trA1.sjAnnot[iex]==0) + { + uint 
jS=trA1.exons[iex][EX_G]+trA1.exons[iex][EX_L]; + uint jE=trA1.exons[iex+1][EX_G]-1; + if ( binarySearch2(jS,jE,P.sjNovelStart,P.sjNovelEnd,P.sjNovelN) < 0 ) return; + }; + }; + + //check the length of the iS2 exon. TODO: build the transcripts vs iS1, check the actual exon length + bool exonLongEnough = trA1.exons[0][EX_L] >= ( trA1.sjAnnot[0]==0 ? P.alignSJoverhangMin : P.alignSJDBoverhangMin ); + + if (exonLongEnough && score2>0 && score2+scoreSeedBest[iS2] > scoreSeedBest[iS1] ) { + scoreSeedBest[iS1]=score2+scoreSeedBest[iS2]; + scoreSeedBestMM[iS1]=trA1.nMM; + scoreSeedBestInd[iS1]=iS2; + }; + } else {//extend to the left + score2=WA[iW][iS1][WA_Length]; + if ( WA[iW][iS1][WA_rStart]>0 \ + && extendAlign(R, mapGen.G, WA[iW][iS1][WA_rStart]-1, WA[iW][iS1][WA_gStart]-1, -1, -1, WA[iW][iS1][WA_rStart], 100000, 0, outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax, \ + P.alignEndsType.ext[WA[iW][iS1][WA_iFrag]][trA.Str], &trA1) ) {//if could extend + score2 += trA1.maxScore; + }; + + bool exonLongEnough = (WA[iW][iS1][WA_Length]+trA1.extendL) >= P.alignSJoverhangMin; //TODO new parameter to control end exons length + + if (exonLongEnough && score2 > scoreSeedBest[iS1] ) { + scoreSeedBest[iS1]=score2; + scoreSeedBestInd[iS1]=iS1; +// scoreSeedBestMM[iS1]=trA1.nMM; + }; + }; + }; + }; + + intScore scoreBest=0; + uint scoreBestInd=0; + + + for (uint iS1=0;iS1= P.alignSJoverhangMin; //TODO new parameter to control end exons length + + if (exonLongEnough && scoreSeedBest[iS1]>scoreBest) {//record new best transcript + scoreBest=scoreSeedBest[iS1]; + scoreBestInd=iS1; + }; + }; + + uint seedN=0; + while (true) {//construct the sequence of seeds + seedChain[seedN++]=scoreBestInd; + WAincl[scoreBestInd]=true; + if (scoreBestInd>scoreSeedBestInd[scoreBestInd]){//keep going + scoreBestInd=scoreSeedBestInd[scoreBestInd]; + } else {//this seed is the first one + break; + }; + }; + + int Score=0; + {//build final transcript form seedChain + {//initiate transcript + + 
uint iS1=seedChain[seedN-1]; + Score= WA[iW][iS1][WA_Length]; + trA.maxScore = Score; + trA.nMatch = WA[iW][iS1][WA_Length]; //# of matches + trA.nMM = 0; + + trA.exons[0][EX_R] = trA.rStart = WA[iW][iS1][WA_rStart]; + trA.exons[0][EX_G] = trA.gStart = WA[iW][iS1][WA_gStart]; + trA.exons[0][EX_L] = WA[iW][iS1][WA_Length]; + trA.exons[0][EX_iFrag]=WA[iW][iS1][WA_iFrag]; + trA.exons[0][EX_sjA]=WA[iW][iS1][WA_sjA]; + + trA.nExons=1; + + }; + + for (uint iSc=seedN-1; iSc>0; iSc--) {//stitch seeds from the chain + uint iS1=seedChain[iSc], iS2=seedChain[iSc-1]; + int scoreStitch= stitchAlignToTranscript(WA[iW][iS1][WA_rStart]+WA[iW][iS1][WA_Length]-1, WA[iW][iS1][WA_gStart]+WA[iW][iS1][WA_Length]-1,\ + WA[iW][iS2][WA_rStart], WA[iW][iS2][WA_gStart], WA[iW][iS2][WA_Length], WA[iW][iS2][WA_iFrag], WA[iW][iS2][WA_sjA], \ + P, R, mapGen, &trA, outFilterMismatchNmaxTotal); +// if (scoreStitch>0) { + Score+=scoreStitch; +// } else { +// cout <<"BUG"<0 \ + && extendAlign(R, mapGen.G, trA.exons[0][EX_R]-1, trA.exons[0][EX_G]-1, -1, -1, trA.exons[0][EX_R], 100000, 0, outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax, + P.alignEndsType.ext[trA.exons[0][EX_iFrag]][trA.Str], &trA1) ) {//if could extend + + trA.add(&trA1); + + trA.exons[0][EX_R] -= trA1.extendL; + trA.exons[0][EX_G] -= trA1.extendL; + trA.exons[0][EX_L] += trA1.extendL; + trA.rStart = trA.exons[0][EX_R]; + trA.gStart = trA.exons[0][EX_G]; + }; + }; + + {//extend to the right + uint iS1=seedChain[0]; + trA1=*trInit;//initialize trA1 + uint tR2=WA[iW][iS1][WA_rStart]+WA[iW][iS1][WA_Length]; + uint tG2=WA[iW][iS1][WA_gStart]+WA[iW][iS1][WA_Length]; + if ( tR2 < Lread \ + && extendAlign(R, mapGen.G, tR2, tG2, +1, +1, Lread-tR2, 100000, scoreSeedBestMM[iS1], outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax, \ + P.alignEndsType.ext[trA.exons[trA.nExons-1][EX_iFrag]][1-trA.Str], &trA1) ) {//if could extend + trA.add(&trA1); + trA.exons[trA.nExons-1][EX_L] += trA1.extendL;//extend the length of the last exon + 
}; + }; + + }; + + //debug: recalculate the number of MM +// { +// uint nMM1=0; +// for (uint iex=0;iex=0) + {//junctions - others are indels + sjN++; + trA.intronMotifs[trA.sjStr[iex]]++; + }; + }; + + if (trA.intronMotifs[1]>0 && trA.intronMotifs[2]==0) + trA.sjMotifStrand=1; + else if (trA.intronMotifs[1]==0 && trA.intronMotifs[2]>0) + trA.sjMotifStrand=2; + else + trA.sjMotifStrand=0; + + if (trA.intronMotifs[1]>0 && trA.intronMotifs[2]>0 && P.outFilterIntronStrands=="RemoveInconsistentStrands") + return; + + if (sjN>0 && trA.sjMotifStrand==0 && P.outSAMstrandField.type==1) {//strand not defined for a junction + return; + }; + + if (P.outFilterIntronMotifs=="None") {//no filtering + + } else if (P.outFilterIntronMotifs=="RemoveNoncanonical") { + for (uint iex=0;iexlogMain, EXIT_CODE_INPUT_FILES, P); + }; + +// if (P.outFilterIntronMotifs=="KeepCanonical" && (trA.intronMotifs[0]>0 || (trA.intronMotifs[1]>0 && trA.intronMotifs[2]>0) ) ) {//keep only conistent canonical introns +// return; +// }; + + + //check exons lengths including repeats, do not report a transcript with short exons +// for (uint isj=0;isj=0 && +// ( trA.exons[isj][EX_L] < P.alignSJoverhangMin + trA.shiftSJ[isj][0] +// || trA.exons[isj+1][EX_L] < P.alignSJoverhangMin + trA.shiftSJ[isj][1]) ) { +// return;//do not record this transcript in wTr +// }; +// }; + }; + + if (WAexcl==NULL) + {//record the transcript TODO: allow for multiple transcripts in one window + *(trAll[iWrec][0])=trA; + nWinTr[iWrec]=1; + } else + {//record 2nd best alignment in this window + *(trAll[iWrec][1])=trA; + nWinTr[iWrec]=2; + }; +}; diff --git a/star-sys/STAR/source/ReadAlign_storeAligns.cpp b/star-sys/STAR/source/ReadAlign_storeAligns.cpp new file mode 100644 index 0000000..1aaa59e --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_storeAligns.cpp @@ -0,0 +1,160 @@ +/** ReadAlign - one read, all alignments + */ + +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "ReadAlign.h" 
+#include "ErrorWarning.h" + +void ReadAlign::storeAligns (uint iDir, uint Shift, uint Nrep, uint L, uint indStartEnd[2], uint iFrag) {//fill in alignment data + + #ifdef OFF_BEFORE_STORE + #warning OFF_BEFORE_STORE + return; + #endif + + if ( Nrep > P.seedMultimapNmax ) {// if a piece maps too many times, do not store it + if ( Nrep < multNmin || multNmin==0 ) {multNmin=Nrep; multNminL=L;}; + return; + }; + + nUM[ Nrep==1 ? 0:1] += Nrep; //add numbers of U/M aligns + nA += Nrep; + + uint rStart=iDir==0 ? Shift : Shift+1-L;//alignment read-start + + #define OPTIM_STOREaligns_SIMPLE + #ifdef OPTIM_STOREaligns_SIMPLE + //find the place to insert the new entry to keep it sorted + int iP; + for (iP=nP-1; iP>=0; iP--) { + if ( PC[iP][0]<=rStart ) { + if ( (PC[iP][PC_rStart]==rStart) && PC[iP][PC_Length]=iP;ii--) {//move old entries to free space for the new one + for (int jj=0;jj P.seedPerReadNmax) { + ostringstream errOut; + errOut <<"EXITING because of FATAL error: too many pieces pere read\n" ; + errOut <<"SOLUTION: increase input parameter --seedPerReadNmax"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_RUNTIME, P); + }; + #else +// int iP3; +// for (iP3=nP-1; iP3>=0; iP3--) { +// if ( PC[iP3][0]<=rStart ) { +// if ( (PC[iP3][PC_rStart]==rStart) && PC[iP3][PC_Length]= PC[iP1][PC_rStart] ) {//is new seed within an old seed + if ( rStart+L <= PC[iP1][PC_rStart]+PC[iP1][PC_Length] ) {//new seed is within the old piece + //decide whether to keep the new one + if ( (PC[iP1][PC_Nrep]==Nrep)) {//seeds map the same number of times == to the same loci + if (nRemove>0) {//debug + cout << "BUG: nRemove="<logMain, EXIT_CODE_RUNTIME, P); + }; + for (int ii=nP-1;ii>=iP;ii--) {//move old entries to free space for the new one + for (int jj=0;jjuniqLmax) { + uniqLmax=L; + uniqLmaxInd=nP-1; + }; + } else { + if ( Nrep < multNmin || multNmin==0 ) {multNmin=Nrep; multNminL=L;}; + if ( L > multLmax ) {multLmax=L;multLmaxN=Nrep;}; + if ( Nrep > multNmax ) 
{multNmax=Nrep; multNmaxL=L;}; + }; +}; diff --git a/star-sys/STAR/source/ReadAlign_waspMap.cpp b/star-sys/STAR/source/ReadAlign_waspMap.cpp new file mode 100644 index 0000000..1d61974 --- /dev/null +++ b/star-sys/STAR/source/ReadAlign_waspMap.cpp @@ -0,0 +1,103 @@ +#include "ReadAlign.h" + +void ReadAlign::waspMap() { + if (!P.wasp.yes || trBest->varAllele.size()==0) {//no variants, vW tag will not be output + waspType=-1; + return; + } else if (nTr>1) {//multimapping read + waspType=2; + return; + } else if (trBest->varAllele.size()>10) {//multimapping read + waspType=7; + return; + }; + + + waspRA->copyRead(*this); + + vector vA=trBest->varAllele; + + for (const auto& a : vA) { + if (a>3) {//read has N for the variant, drop it + waspType=3; + return; + }; + }; + + + + vector> vvA {{}}; //all combinations + for (const auto& u : vA) {//cycle over vars, each time add new variant by adding 2 variants to each of the existing combinations + (void) u; //to avoid unused warning + vector> r; //temp + for (const auto& x : vvA) { + for (const auto& y:{1,2}) { + r.push_back(x); + r.back().push_back(y); + }; + }; + vvA=move(r); + }; + + + for (const auto& vA1 : vvA) {//cycle over all combinations + + if (vA1==vA) + continue; //this combination was already mapped as the real read + + for (uint iv=0; ivsnp.nt[trBest->varInd.at(iv)][vA1.at(iv)]; //the other allele + uint vr=trBest->varReadCoord.at(iv);//read coordinate + + if (trBest->Str==1) {//variant was found on the - strand alignment + nt2=3-nt2; + vr=Lread-1-vr; + }; + waspRA->Read1[0][vr] =nt2; + waspRA->Read1[1][vr] =3-nt2; + waspRA->Read1[2][Lread-1-vr]=3-nt2; + }; + + waspRA->mapOneRead(); + waspRA->multMapSelect(); + waspRA->mappedFilter(); + + if (waspRA->unmapType!=-1) { + waspType=4; + return; + } else if (waspRA->nTr>1) { + waspType=5; + return; + } else if (waspRA->trBest->nExons!=trBest->nExons) { + waspType=6; + return; + } else { + for (uint ii=0; iinExons; ii++) { + for (uint jj=0; jj<=2; jj++) { + if 
(trBest->exons[ii][jj]!=waspRA->trBest->exons[ii][jj]) { + waspType=6; + return;//this combination maps to a different place, return with waspType 0 (set above) + }; + }; + }; + }; + }; + waspType=1; //all combinations resulted in the same alignment + return; +}; + +void ReadAlign::copyRead(ReadAlign &r) {//copy read information only + Lread=r.Lread; + readLength[0]=r.readLength[0];readLength[1]=r.readLength[1]; + readLengthOriginal[0]=r.readLengthOriginal[0];readLengthOriginal[1]=r.readLengthOriginal[1]; + readLengthPairOriginal=r.readLengthPairOriginal; + outFilterMismatchNmaxTotal=r.outFilterMismatchNmaxTotal; + readName=r.readName; + + for (uint ii=0;ii<=2;ii++) + memcpy(Read1[ii],r.Read1[ii],Lread);//need to copy since it will be changed + Qual1=r.Qual1; + +}; diff --git a/star-sys/STAR/source/SequenceFuns.cpp b/star-sys/STAR/source/SequenceFuns.cpp new file mode 100644 index 0000000..f2476cb --- /dev/null +++ b/star-sys/STAR/source/SequenceFuns.cpp @@ -0,0 +1,353 @@ +#include "SequenceFuns.h" + +void complementSeqNumbers(char* ReadsIn, char* ReadsOut, uint Lread) {//complement the numeric sequences + for (uint jj=0;jj3) {//N + if (posN>=0) + return -2; //two Ns + posN=ii; + nt=0; + }; + intOut = intOut << 2; + intOut +=nt; + //intOut += nt<<(2*ii); + }; + return posN; +}; + +string convertNuclInt32toString(uint32 nuclNum, const uint32 L) { + string nuclOut(L,'N'); + string nuclChar="ACGT"; + + for (uint32 ii=1; ii<=L; ii++) { + nuclOut[L-ii] = nuclChar[nuclNum & 3]; + nuclNum = nuclNum >> 2; + }; + + return nuclOut; +}; + +int64 convertNuclStrToInt64(const string S, uint64 &intOut) { + intOut=0; + int64 posN=-1; + for (uint64 ii=0; ii3) {//N + if (posN>=0) + return -2; //two Ns + posN=ii; + nt=0; + }; + intOut = intOut << 2; + intOut +=nt; + //intOut += nt<<(2*ii); + }; + return posN; +}; + +string convertNuclInt64toString(uint64 nuclNum, const uint32 L) { + string nuclOut(L,'N'); + string nuclChar="ACGT"; + + for (uint64 ii=1; ii<=L; ii++) { + nuclOut[L-ii] 
= nuclChar[nuclNum & 3]; + nuclNum = nuclNum >> 2; + }; + + return nuclOut; +}; + + +uint chrFind(uint Start, uint i2, uint* chrStart) {// find chromosome from global locus + uint i1=0, i3; + while (i1+1 Start ) { + i2=i3; + } else { + i1=i3; + }; + }; + return i1; +}; + +uint localSearch(const char *x, uint nx, const char *y, uint ny, double pMM){ + //find the best alignment of two short sequences x and y + //pMM is the maximum percentage of mismatches + uint nMatch=0, nMM=0, nMatchBest=0, nMMbest=0, ixBest=nx; + for (uint ix=0;ix3) continue; + if (x[ix+iy]==y[iy]) { + nMatch++; + } else { + nMM++; + }; + }; + + if ( ( nMatch>nMatchBest || (nMatch==nMatchBest && nMMnMatchBest || (nMatch==nMatchBest && nMM3) ) ) { + if (r[iR]==MARK_FRAG_SPACER_BASE) iFrag++; //count read fragments + iR++; + }; + + if (iR==L) break; //exit when reached end of read + + iR1=iR; + + //find the next bad base + while ( iR=Qsplit && r[iR]<=3 ) { + iR++; + }; + + if ( (iR-iR1)>LgoodMin ) LgoodMin=iR-iR1; + if ( (iR-iR1) +#include +#include +#include /* For mode constants */ +#include /* For O_* constants */ +#include +#include + +#ifdef COMPILE_FOR_MAC + //some Mac's idiosyncrasies: standard SHM libraries are very old and missing some definitions + #define SHM_NORESERVE 0 +#endif + +using namespace std; + +SharedMemory::SharedMemory(key_t key, bool unloadLast): _key(key), _counterKey(key+1), _unloadLast(unloadLast), _err(&cerr) +{ + _shmID = -1; + _sharedCounterID = -1; + _counterMem = 0; + _mapped=NULL; + _length = NULL; + _sem=NULL; + _isAllocator = false; + _needsAllocation = true; + + EnsureCounter(); + OpenIfExists(); +} + +SharedMemory::~SharedMemory() +{ + try + { + int inUse = SharedObjectsUseCount()-1; + Close(); + + if (_unloadLast) + { + if (inUse > 0) + { + (*_err) << inUse << " other job(s) are attached to the shared memory segment, will not remove it." <=0; + if (! 
(exists || errno == ENOENT)) + ThrowError(EOPENFAILED, errno); // it's there but we couldn't get a handle + + if (exists) + { + MapSharedObjectToMemory(); + + _needsAllocation = false; + } +} + +#ifdef POSIX_SHARED_MEM +struct stat SharedMemory::GetSharedObjectInfo() +{ + struct stat buf; + int err = fstat(_shmID, &buf); + if (err == -1) + ThrowError(EOPENFAILED, errno); + + return buf; +} +#endif + +void SharedMemory::MapSharedObjectToMemory() +{ +#ifdef POSIX_SHARED_MEM + size_t size=0; + struct stat buf = SharedMemory::GetSharedObjectInfo(); + size = (size_t) buf.st_size; + _mapped = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, _shmID, (off_t) 0); + + if (_mapped==((void *) -1)) + ThrowError(EMAPFAILED, errno); + + _length = (size_t *) _mapped; + *_length = size; +#else + _mapped= shmat(_shmID, NULL, 0); + + if (_mapped==((void *) -1)) + ThrowError(EMAPFAILED, errno); + + _length = (size_t *) _mapped; +#endif +} + +void SharedMemory::Close() +{ + #ifdef POSIX_SHARED_MEM + if (_mapped != NULL) + { + int ret = munmap(_mapped, (size_t) *_length); + if (ret == -1) + ThrowError(EMAPFAILED, errno); + _mapped = NULL; + } + + if (_shmID != -1) + { + int err = close(_shmID); + _shmID=-1; + if (err == -1) + ThrowError(ECLOSE, errno); + } + + #else + if (_mapped != NULL) + { + shmdt(_mapped); + _mapped = NULL; + } + #endif +} + +void SharedMemory::Unlink() +{ + if (!_needsAllocation) + { + int shmStatus=-1; + #ifdef POSIX_SHARED_MEM + shmStatus = shm_unlink(GetPosixObjectKey().c_str()); + #else + struct shmid_ds buf; + shmStatus=shmctl(_shmID,IPC_RMID,&buf); + #endif + if (shmStatus == -1) + ThrowError(EUNLINK, errno); + + _needsAllocation = true; + } +} + +void SharedMemory::Clean() +{ + Close(); + Unlink(); + RemoveSharedCounter(); +} + +void SharedMemory::EnsureCounter() +{ + if (_sharedCounterID < 0) + _sharedCounterID=shmget(_counterKey,0,0); + + bool exists=_sharedCounterID>=0; + + if (!exists) + { + errno=0; + 
_sharedCounterID=shmget(_counterKey, 1, IPC_CREAT | IPC_EXCL | SHM_NORESERVE | 0666); + + if (_sharedCounterID < 0) + ThrowError(ECOUNTERCREATE, errno); + } + + if (_counterMem == 0) + { + _counterMem = shmat(_sharedCounterID, NULL, 0); + + if (_counterMem==((void *) -1)) + ThrowError(EMAPFAILED, errno); + } +} + +void SharedMemory::RemoveSharedCounter() +{ + struct shmid_ds buf; + int shmStatus=shmctl(_sharedCounterID,IPC_RMID,&buf); + if (shmStatus == -1) + ThrowError(ECOUNTERREMOVE, errno); +} + +int SharedMemory::SharedObjectsUseCount() +{ + EnsureCounter(); + if (_sharedCounterID != -1) + { + struct shmid_ds shmStat; + int shmStatus=shmctl(_sharedCounterID,IPC_STAT,&shmStat); + if (shmStatus == -1) + ThrowError(ECOUNTERUSE, errno); + + return shmStat.shm_nattch; + } + else + return -1; +} diff --git a/star-sys/STAR/source/SharedMemory.h b/star-sys/STAR/source/SharedMemory.h new file mode 100644 index 0000000..13a3e77 --- /dev/null +++ b/star-sys/STAR/source/SharedMemory.h @@ -0,0 +1,182 @@ +// SharedMemory.cpp +// Gery Vessere - gvessere@illumina.com, gery@vessere.com +// An abstraction over both SysV and POSIX shared memory APIs + +#ifndef SHAREDMEMORY_H +#define SHAREDMEMORY_H + + +#include +#include +#include +#include +#include + + +enum ErrorState { +ENONE, +ENOTALLOCATED, +ETRYAGAIN, +EALREADYALLOCATED, +EOPENFAILED, +EEXISTS, +EFTRUNCATE, +EMAPFAILED, +ECLOSE, +EUNLINK, +ECOUNTERCREATE, +ECOUNTERREMOVE, +ECOUNTERUSE +}; + + +class SharedMemoryException: public std::exception +{ +private: + bool _hasError; + ErrorState _error; + int _errorDetail; + +public: + + SharedMemoryException() + { + _hasError = false; + _error = ENONE; + _errorDetail = 0; + }; + + SharedMemoryException(ErrorState error): _error(error) + {}; + + ErrorState GetErrorCode() const + { + return _error; + }; + + int GetErrorDetail() const + { + return _errorDetail; + } + + bool HasError() const + { + return _hasError; + }; + + void SetError(ErrorState error, int detail) + { + if 
(!_hasError) + { + _hasError = true; + _error = error; + _errorDetail = detail; + } + } + + void ClearError() + { + _hasError = false; + _error = ENONE; + _errorDetail = 0; + }; +}; + +class SharedMemory +{ +public: + void * GetMapped() + { + return (void *) ((char*) _mapped + sizeof(size_t)); + }; + + size_t GetSize() + { + if (!_needsAllocation) + return *_length - sizeof(size_t); + + _exception.SetError(ENOTALLOCATED, 0); + return -1; + }; + + bool NeedsAllocation() const + { + return _needsAllocation; + }; + + // the owner is the first one that created the named shared memory segment + bool IsAllocator() const + { + return _isAllocator; + }; + + int GetId() const + { + return _shmID; + }; + + bool HasError() const + { + return _exception.HasError(); + } + + void ThrowError(ErrorState error, int detail) + { + if (!_exception.HasError()) + { + _exception.SetError(error, detail); + } + throw _exception; + }; + + void ThrowError(ErrorState error) + { + ThrowError(error, 0); + }; + + void SetErrorStream(std::ostream * err) + { + _err = err; + }; + + SharedMemory(key_t key, bool unloadLast); + ~SharedMemory(); + void Allocate(size_t shmSize); + void Clean(); + +private: + SharedMemoryException _exception; + + int _shmID; + int _sharedCounterID; + void * _counterMem; + + void * _mapped; + size_t * _length; + sem_t * _sem; + bool _isAllocator; + bool _needsAllocation; + + key_t _key; + key_t _counterKey; + bool _unloadLast; + std::ostream * _err; + + int SharedObjectsUseCount(); + void OpenIfExists(); + void CreateAndInitSharedObject(size_t shmSize); + void MapSharedObjectToMemory(); + std::string GetPosixObjectKey(); + struct stat GetSharedObjectInfo(); + void Close(); + void Unlink(); + + std::string CounterName(); + + void EnsureCounter(); + void RemoveSharedCounter(); + void SharedUseIncrement(); + void SharedUseDecrement(); +}; + +#endif diff --git a/star-sys/STAR/source/SjdbClass.h b/star-sys/STAR/source/SjdbClass.h new file mode 100644 index 0000000..9d518e6 --- 
/dev/null +++ b/star-sys/STAR/source/SjdbClass.h @@ -0,0 +1,17 @@ +#ifndef DEF_SjdbClass +#define DEF_SjdbClass + +#include "IncludeDefine.h" +#include + +class SjdbClass { +public: + vector chr; + vector start,end; + vector str; + vector priority; + + vector> gene; +}; + +#endif diff --git a/star-sys/STAR/source/Solo.cpp b/star-sys/STAR/source/Solo.cpp new file mode 100644 index 0000000..e63b2bb --- /dev/null +++ b/star-sys/STAR/source/Solo.cpp @@ -0,0 +1,23 @@ +#include "Solo.h" + +Solo::Solo(ReadAlignChunk **RAchunkIn, Parameters &Pin, Transcriptome &inTrans) + : RAchunk(RAchunkIn), P(Pin), pSolo(P.pSolo), Trans(inTrans) +{ + if (pSolo.type==0 ) + return; + + soloFeat = new SoloFeature*[pSolo.nFeatures]; + for (uint32 ii=0; iiprocessRecords(RAchunk); + }; +}; diff --git a/star-sys/STAR/source/Solo.h b/star-sys/STAR/source/Solo.h new file mode 100644 index 0000000..e2e17cb --- /dev/null +++ b/star-sys/STAR/source/Solo.h @@ -0,0 +1,26 @@ +#ifndef H_Solo +#define H_Solo +#include "IncludeDefine.h" +#include "ReadAlignChunk.h" +#include "Transcriptome.h" +#include + +#include "SoloFeature.h" + + +class Solo { +public: + + SoloFeature **soloFeat; + + Solo(ReadAlignChunk **RAchunk, Parameters &Pin, Transcriptome &inTrans); + void processAndOutput(); + +private: + ReadAlignChunk **RAchunk; + Parameters &P; + ParametersSolo &pSolo; + Transcriptome &Trans; +}; + +#endif diff --git a/star-sys/STAR/source/SoloFeature.cpp b/star-sys/STAR/source/SoloFeature.cpp new file mode 100644 index 0000000..5757e46 --- /dev/null +++ b/star-sys/STAR/source/SoloFeature.cpp @@ -0,0 +1,17 @@ +#include "SoloFeature.h" +#include "streamFuns.h" + +SoloFeature::SoloFeature(int feTy, const Parameters &Pin, Transcriptome &inTrans) + : featureType(feTy), P(Pin), pSolo(P.pSolo), Trans(inTrans) +{ + + readFeatSum = new SoloReadFeature(featureType,P,-1); + readBarSum = new SoloReadBarcode(P); + readFeatAll = new SoloReadFeature*[P.runThreadN]; + readBarAll = new SoloReadBarcode*[P.runThreadN]; + + 
if (pSolo.type==0) + return; + + statsStream = &ofstrOpen(P.outFileNamePrefix+pSolo.outFileNames[0]+pSolo.featureNames[featureType]+".stats",ERROR_OUT, P); +}; diff --git a/star-sys/STAR/source/SoloFeature.h b/star-sys/STAR/source/SoloFeature.h new file mode 100644 index 0000000..c72b891 --- /dev/null +++ b/star-sys/STAR/source/SoloFeature.h @@ -0,0 +1,45 @@ +#ifndef H_SoloFeature +#define H_SoloFeature +#include "IncludeDefine.h" +#include "ReadAlignChunk.h" +#include "Transcriptome.h" +#include + +#include "SoloRead.h" + +class SoloFeature { +public: + + SoloReadFeature *readFeatSum, **readFeatAll; + SoloReadBarcode *readBarSum, **readBarAll; + + uint64 nReadsMapped, nCB; //total number of mapped reads + + uint32 *rGeneUMI;//mapped reads sorted by CB + uint32 *indCB;//index of detected CBs in the whitelist + uint32 *rCBn;//number of reads for detected CBs in the whitelist + uint32 **rCBp;//array of pointers to each CB sub-array + uint32 *nUperCB;//number of UMIs per CB + uint32 *nGperCB;//number of genes (with >0 UMIs) per CB + uint32 nCellGeneEntries;//total number of non-zero cell/gene combinations (entries in the output matrix) + + ofstream *statsStream; + + array,2> sjAll; + + SoloFeature(int feTy, const Parameters &Pin, Transcriptome &inTrans); + void processRecords(ReadAlignChunk **RAchunk); + void collapseUMI(uint32 *rGU, uint32 rN, uint32 &nGenes, uint32 &nUtot, uint32 *umiArray); + void outputResults(); + +private: + const int32 featureType; + + const Parameters &P; + const ParametersSolo &pSolo; + Transcriptome &Trans; + + static const uint32 umiArrayStride=3; +}; + +#endif diff --git a/star-sys/STAR/source/SoloFeature_collapseUMI.cpp b/star-sys/STAR/source/SoloFeature_collapseUMI.cpp new file mode 100644 index 0000000..9ec83bd --- /dev/null +++ b/star-sys/STAR/source/SoloFeature_collapseUMI.cpp @@ -0,0 +1,185 @@ +#include "SoloFeature.h" +#include "streamFuns.h" +#include "TimeFunctions.h" +#include "serviceFuns.cpp" + +#define def_MarkNoColor (uint32) 
-1 + +void collapseUMIwith1MMlowHalf(uint32 *rGU, uint32 umiArrayStride, uint32 umiMaskLow, uint32 nU0, uint32 &nU1, uint32 &nU2, uint32 &nC, vector> &vC) +{ + const uint32 bitTop=1<<31; + const uint32 bitTopMask=~bitTop; + + for (uint32 iu=0; iu umiMaskLow) + break; //upper half is different + + if (uuXor >> (__builtin_ctz(uuXor)/2)*2 > 3) //shift by even number of trailing zeros + continue;//>1MM + + //1MM UMI + + //graph coloring + if ( rGU[iu+2] == def_MarkNoColor && rGU[iuu+2] == def_MarkNoColor ) {//no color + //new color + rGU[iu+2] = nC; + rGU[iuu+2] = nC; + ++nC; + nU1 -= 2;//subtract the duplicated UMIs + } else if ( rGU[iu+2] == def_MarkNoColor ) { + rGU[iu+2] = rGU[iuu+2]; + --nU1;//subtract the duplicated UMIs + } else if ( rGU[iuu+2] == def_MarkNoColor ) { + rGU[iuu+2] = rGU[iu+2]; + --nU1;//subtract the duplicated UMIs + } else {//both color + if (rGU[iuu+2] != rGU[iu+2]) {//color conflict + //uint32 p[2]={rGU[iu+2],rGU[iuu+2]}; + vC.push_back({rGU[iu+2],rGU[iuu+2]}); + //vC.push_back({rGU[iuu+2],rGU[iu+2]}); + }; + }; + + //directional collapse + if ( (rGU[iuu+1] & bitTop) == 0 && (rGU[iu+1] & bitTopMask)>(2*(rGU[iuu+1] & bitTopMask)-1) ) {//iuu is duplicate of iu + rGU[iuu+1] |= bitTop; + --nU2;//subtract the duplicated UMIs + } else if ( (rGU[iu+1] & bitTop) == 0 && (rGU[iuu+1] & bitTopMask)>(2*(rGU[iu+1] & bitTopMask)-1) ) {//iu is duplicate of iuu + rGU[iu+1] |= bitTop; + --nU2;//subtract the duplicated UMIs + }; + }; + }; +}; + +void graphDepthFirstSearch(uint32 n, vector &nodeVisited, vector> &nodeEdges) { + for (const auto &nn : nodeEdges[n]) { + if (!nodeVisited[nn]) { + nodeVisited[nn]=true; + graphDepthFirstSearch(nn,nodeVisited,nodeEdges); + }; + }; +}; + +uint32 graphNumberOfConnectedComponents(uint32 N, vector> V) {//find number of connected components + //N=number of nodes + //V=edges, list of connected nodes, each pair of nodes listed once + //simple recursive DFS + + //sort +// 
qsort(V.data(),V.size(),2*sizeof(uint32),funCompareNumbers); + if (V.size()==0) + return N; + + vector> nodeEdges (N); + for (uint32 ii=0; ii nodeVisited(N,false); + + uint32 nConnComp=0; + for (uint32 ii=0; ii); //sort by gene number + + //compact reads per gene + uint32 gid1=-1;//current gID + nGenes=0; //number of genes + uint32 *gID = new uint32[min(Trans.nGe,rN)+1]; //gene IDS + uint32 *gReadS = new uint32[min(Trans.nGe,rN)+1]; //start of gene reads TODO: allocate this array in the 2nd half of rGU + for (uint32 iR=0; iR<2*rN; iR+=2) { + if (rGU[iR]!=gid1) {//record gene boundary + gReadS[nGenes]=iR; + gid1=rGU[iR]; + gID[nGenes]=gid1; + ++nGenes; + }; + rGU[iR]=rGU[iR+1]; //shift UMIs + //rGU[iR+1] storage this will be used later for counting + }; + gReadS[nGenes]=2*rN;//so that gReadS[nGenes]-gReadS[nGenes-1] is the number of reads for nGenes + + uint32 *nUg = new uint32[nGenes*3];//3 types of counts + nUtot=0; + for (uint32 iG=0; iG); + + //exact collapse + uint32 iR1=-umiArrayStride; //number of distinct UMIs for this gene + uint32 u1=-1; + for (uint32 iR=0; iRnRumiMax) nRumiMax=umiArray[iR1+1]; + }; + uint32 nU0=(iR1+umiArrayStride)/umiArrayStride; + + //collapse with 1MM + uint32 nU1=nU0, nU2=nU0;//2 types of 1MM collapsing + uint32 nC=0; //graph colors + vector> vC;//color connections + + collapseUMIwith1MMlowHalf(umiArray, umiArrayStride, pSolo.umiMaskLow, nU0, nU1, nU2, nC, vC); + + //exchange low and high half of UMIs, re-sort, and look for 1MM again + for (uint32 iu=0; iu>(pSolo.umiL); + umiArray[iu] &= pSolo.umiMaskLow; //remove high + umiArray[iu] <<= (pSolo.umiL); //move low to high + umiArray[iu] |= high; //add high + }; + qsort(umiArray, nU0, umiArrayStride*sizeof(uint32), funCompareNumbers); + collapseUMIwith1MMlowHalf(umiArray, umiArrayStride, pSolo.umiMaskLow, nU0, nU1, nU2, nC, vC); + + nUg[3*iG]=nU0; + nUg[3*iG+1]=nU1+graphNumberOfConnectedComponents(nC,vC); + nUg[3*iG+2]=nU2; + nUtot+=nUg[3*iG+1]; + }; + + uint32 *rGUp=rGU; + for (uint32 
iG=0; iG1) {//record 2 more counts + rGUp[2]=nUg[3*iG+1]; + rGUp[3]=nUg[3*iG+2]; + rGUp += 4; + } else {//only one count recorded, save space + rGUp += 2; + }; + }; + //cout << nRumiMax << '\n'; + +}; diff --git a/star-sys/STAR/source/SoloFeature_outputResults.cpp b/star-sys/STAR/source/SoloFeature_outputResults.cpp new file mode 100644 index 0000000..434d5a1 --- /dev/null +++ b/star-sys/STAR/source/SoloFeature_outputResults.cpp @@ -0,0 +1,59 @@ +#include "SoloFeature.h" +#include "streamFuns.h" +#include "TimeFunctions.h" +#include "serviceFuns.cpp" +#include "SequenceFuns.h" + +void SoloFeature::outputResults() +{ + if (featureType==0) {//this only need to be done once + //output genes + ofstream &geneStr=ofstrOpen(P.outFileNamePrefix+pSolo.outFileNames[0]+pSolo.outFileNames[1],ERROR_OUT, P); + for (uint32 ii=0; ii1) {//3 counts recorded + count1[0] = rCBpp[1]; + count1[1] = rCBpp[2]; + count1[2] = rCBpp[3]; + rCBpp += 4; + } else {//1 recorded + rCBpp +=2; + }; + for (uint32 ii=0; iiflush(); +}; diff --git a/star-sys/STAR/source/SoloFeature_processRecords.cpp b/star-sys/STAR/source/SoloFeature_processRecords.cpp new file mode 100755 index 0000000..a4b9dff --- /dev/null +++ b/star-sys/STAR/source/SoloFeature_processRecords.cpp @@ -0,0 +1,115 @@ +#include "SoloFeature.h" +#include "streamFuns.h" +#include "TimeFunctions.h" + +void SoloFeature::processRecords(ReadAlignChunk **RAchunk) +{ + if (pSolo.type==0) + return; + + time_t rawTime; + time(&rawTime); + P.inOut->logMain << timeMonthDayTime(rawTime) << " ... 
Starting Solo post-map for " <RA->soloRead->readFeat[pSolo.featureInd[featureType]]; + readBarAll[ii]=RAchunk[ii]->RA->soloRead->readBar; + }; + + for (int ii=0; iiaddCounts(*readFeatAll[ii]); + readBarSum->addCounts(*readBarAll[ii]); + }; + + if (!pSolo.cbWLyes) {//now we can define WL and counts + //pSolo.cbWL.resize(readFeatSum->cbReadCountMap.size()); + readFeatSum->cbReadCount = new uint32[pSolo.cbWL.size()]; + readBarSum->cbReadCountExact = new uint32[pSolo.cbWL.size()]; + + uint64 icb=0; + for (auto ii=readFeatSum->cbReadCountMap.cbegin(); ii!=readFeatSum->cbReadCountMap.cend(); ++ii) { + //pSolo.cbWL[icb]=ii->first; + readFeatSum->cbReadCount[icb]=ii->second; + readBarSum->cbReadCountExact[icb]=ii->second; + ++icb; + }; + }; + + //allocate arrays to store CB/gene/UMIs for all reads + nCB=0;nReadsMapped=0; + for (uint32 ii=0; iicbReadCountExact[ii]>0) { + nCB++; + nReadsMapped += readFeatSum->cbReadCount[ii]; + }; + }; + + rGeneUMI = new uint32[2*nReadsMapped]; //big array for all CBs - each element is gene and UMI + rCBp = new uint32*[nCB+1]; + uint32 **rCBpa = new uint32*[pSolo.cbWL.size()+1]; + indCB = new uint32[nCB]; + + uint32 nReadPerCBmax=0; + rCBp[0]=rGeneUMI; + rCBpa[0]=rGeneUMI; + nCB=0;//will count it again below + for (uint32 ii=0; iicbReadCountExact[ii]>0) {//if no exact matches, this CB is not present + indCB[nCB]=ii; + rCBp[nCB+1] = rCBp[nCB] + 2*readFeatSum->cbReadCount[ii]; + ++nCB; + }; + rCBpa[ii+1]=rCBp[nCB]; + }; + + //read and store the CB/gene/UMI from files + time(&rawTime); + P.inOut->logMain << timeMonthDayTime(rawTime) << " ... 
Finished allocating arrays for Solo " << nReadsMapped*2.0*8/1024/1024/1024 <<" GB" <inputRecords(rCBpa,readBarSum->cbReadCountExact); + }; + + for (uint32 iCB=0; iCBnReadPerCBmax) + nReadPerCBmax=nr; + readFeatSum->stats.V[readFeatSum->stats.nMatch] += nr; + }; + + for (int ii=0; iiaddStats(*readFeatAll[ii]); + readBarSum->addStats(*readBarAll[ii]); + }; + + time(&rawTime); + P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Finished reading reads from Solo files nCB="<statsOut(*statsStream); + *statsStream << setw(50)<< pSolo.featureNames[featureType] <<":\n"; + readFeatSum->statsOut(*statsStream); + + //output nU per gene per CB + outputResults(); + +}; diff --git a/star-sys/STAR/source/SoloRead.cpp b/star-sys/STAR/source/SoloRead.cpp new file mode 100755 index 0000000..272e066 --- /dev/null +++ b/star-sys/STAR/source/SoloRead.cpp @@ -0,0 +1,14 @@ +#include "SoloRead.h" + +SoloRead::SoloRead(const Parameters &Pin, int32 iChunkIn) : iChunk(iChunkIn), P(Pin), pSolo(P.pSolo) +{ + readBar = new SoloReadBarcode(P); + + if (pSolo.type==0) + return; + + readFeat = new SoloReadFeature*[pSolo.nFeatures]; + + for (uint32 ii=0; ii &readTrGenes, set &readGenes, Transcript *alignOut); + +private: + const int32 iChunk; + const Parameters &P; + const ParametersSolo &pSolo; +}; + +#endif diff --git a/star-sys/STAR/source/SoloReadBarcode.cpp b/star-sys/STAR/source/SoloReadBarcode.cpp new file mode 100755 index 0000000..57267f6 --- /dev/null +++ b/star-sys/STAR/source/SoloReadBarcode.cpp @@ -0,0 +1,49 @@ +#include "SoloReadBarcode.h" +#include "streamFuns.h" + +SoloReadBarcode::SoloReadBarcode(const Parameters &Pin) : P(Pin), pSolo(P.pSolo) +{ + if (pSolo.type==0) + return; + + for (uint32 ii=0; ii +#include "IncludeDefine.h" +#include "Parameters.h" + +class SoloReadBarcode { +public: + uint32 homoPolymer[4];//homopolymer constants + string cbSeq, umiSeq, cbQual, umiQual; + uint64 cbB; + uint32 umiB; + int64 cbI; + int32 cbMatch;//0=exact, 1=1 match with 1MM, 2= >1 matches 
with 1MM + string cbMatchString;//CB matches and qualities + vector cbMatchInd;//matches + uint32 *cbReadCountExact; + + struct { + enum { nNinBarcode, nUMIhomopolymer, nTooMany, nNoMatch, nStats}; + uint64 V[nStats]; + vector names={"nNinBarcode","nUMIhomopolymer","nTooMany","nNoMatch"}; + } stats; + + SoloReadBarcode(const Parameters &Pin); + void getCBandUMI(string &readNameExtra); + void addCounts(const SoloReadBarcode &rfIn); + void addStats(const SoloReadBarcode &rfIn); + void statsOut(ofstream &streamOut); + +private: + const Parameters &P; + const ParametersSolo &pSolo; +}; + +#endif diff --git a/star-sys/STAR/source/SoloReadBarcode_getCBandUMI.cpp b/star-sys/STAR/source/SoloReadBarcode_getCBandUMI.cpp new file mode 100755 index 0000000..50192dc --- /dev/null +++ b/star-sys/STAR/source/SoloReadBarcode_getCBandUMI.cpp @@ -0,0 +1,97 @@ +#include "SoloReadBarcode.h" +#include "serviceFuns.cpp" +#include "SequenceFuns.h" + +void SoloReadBarcode::getCBandUMI(string &readNameExtra) +{ + if (pSolo.type==0) + return; + cbI=-999; + + cbMatch=-1; + cbMatchString=""; + + cbSeq=readNameExtra.substr(pSolo.cbS-1,pSolo.cbL); + umiSeq=readNameExtra.substr(pSolo.umiS-1,pSolo.umiL); + + uint32 qualStart = readNameExtra.find(' ',pSolo.cbL+pSolo.umiL); + cbQual=readNameExtra.substr(qualStart+pSolo.cbS,pSolo.cbL); + umiQual=readNameExtra.substr(qualStart+pSolo.umiS,pSolo.umiL); + + //check UMIs, return if bad UMIs + if (convertNuclStrToInt32(umiSeq,umiB)!=-1) {//convert and check for Ns + stats.V[stats.nNinBarcode]++;//UMIs are not allowed to have Ns + return; + }; + if (umiB==homoPolymer[0] || umiB==homoPolymer[1] || umiB==homoPolymer[2] || umiB==homoPolymer[3]) { + stats.V[stats.nUMIhomopolymer]++; + return; + }; + + //convert CB and check for Ns + int64 posN=convertNuclStrToInt64(cbSeq,cbB); + + if (!pSolo.cbWLyes) {//no whitelist - no search + if (posN!=-1) {//Ns are present, discard this read + stats.V[stats.nNinBarcode]++; + } else {//no Ns + cbI=(int64) cbB;//all 
possible barcodes are accepted. This will overflow if CB is longer than 31b + cbMatch=0; + }; + return; + }; + + if (posN==-2) {//>2 Ns, might already be filtered by Illumina + stats.V[stats.nNinBarcode]++; + return; + } else if (posN==-1) {//no Ns, count only for featureType==gene + cbI=binarySearchExact(cbB,pSolo.cbWL.data(),pSolo.cbWL.size()); + if (cbI>=0) {//exact match + cbReadCountExact[cbI]++;//note that this simply counts reads per exact CB, no checks of genes or UMIs + cbMatch=0; + return; + }; + }; + + if (posN>=0) {//one N + uint32 posNshift=2*(pSolo.cbL-1-posN);//shift bits for posN + for (uint32 jj=0; jj<4; jj++) { + uint64 cbB1=cbB^(jj<(cbB1,pSolo.cbWL.data(),pSolo.cbWL.size()); + if (cbI1>=0) { + if (cbI>=0) {//had another match already + stats.V[stats.nTooMany]++; + return;//with N in CB, do not allow matching >1 in WL + }; + cbI=cbI1; + }; + }; + if (cbI>=0) { + cbMatch=1; + return; + } else {//no match + stats.V[stats.nNoMatch]++; + return; + }; + }; + + //look for 1MM, posN==-1, no Ns + cbMatch=0; + cbMatchInd.clear(); + for (uint32 ii=0; ii(cbB^(jj<<(ii*2)),pSolo.cbWL.data(),pSolo.cbWL.size()); + if (cbI1>=0) {//found match + //output all + cbI=cbI1; + cbMatchInd.push_back(cbI1); + ++cbMatch; + cbMatchString += ' ' +to_string(cbI1) + ' ' + cbQual.at(pSolo.cbL-1-ii); + }; + }; + }; + if (cbMatch==0) {//no matches + stats.V[stats.nNoMatch]++; + cbMatch=-1; + };// else cbMatch contains number of matches (1 or >1), and cbMatchString contains matches for >1 case +}; diff --git a/star-sys/STAR/source/SoloReadFeature.cpp b/star-sys/STAR/source/SoloReadFeature.cpp new file mode 100755 index 0000000..6586385 --- /dev/null +++ b/star-sys/STAR/source/SoloReadFeature.cpp @@ -0,0 +1,53 @@ +#include "SoloReadFeature.h" +#include "streamFuns.h" + +SoloReadFeature::SoloReadFeature(int32 feTy, const Parameters &Pin, int iChunk) + : featureType(feTy), P(Pin), pSolo(P.pSolo) +{ + if (pSolo.type==0) + return; + + for (uint32 ii=0; ii=0) { + strU_0 = 
&fstrOpen(P.outFileTmp+"/solo"+pSolo.featureNames[featureType]+"_0_"+std::to_string(iChunk),ERROR_OUT, P); + strU_1 = &fstrOpen(P.outFileTmp+"/solo"+pSolo.featureNames[featureType]+"_1_"+std::to_string(iChunk),ERROR_OUT, P); + strU_2 = &fstrOpen(P.outFileTmp+"/solo"+pSolo.featureNames[featureType]+"_2_"+std::to_string(iChunk),ERROR_OUT, P); + }; +}; + +void SoloReadFeature::addCounts(const SoloReadFeature &rfIn) +{ + if (pSolo.cbWLyes) {//WL + for (uint32 ii=0; iifirst] += ii->second; + }; + }; +}; + +void SoloReadFeature::addStats(const SoloReadFeature &rfIn) +{ + for (uint32 ii=0; ii +#include +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "SoloReadBarcode.h" + +class SoloReadFeature { +public: + + uint32 homoPolymer[4];//homopolymer constants + + uint32 *cbReadCount; + map cbReadCountMap; + + fstream *strU_0 ,*strU_1, *strU_2; //unique mappers, CB matches whitelist with 0,1>=2 MM + + struct { + enum { nUnmapped, nNoFeature, nAmbigFeature, nAmbigFeatureMultimap, nTooMany, nNoExactMatch, nExactMatch, nMatch, nCellBarcodes, nUMIs, nStats}; + uint64 V[nStats]; + vector names={"nUnmapped","nNoFeature","nAmbigFeature","nAmbigFeatureMultimap","nTooMany","nNoExactMatch","nExactMatch","nMatch","nCellBarcodes","nUMIs",}; + } stats; + + string cbSeq, umiSeq, cbQual, umiQual; + + SoloReadFeature (int32 feTy, const Parameters &Pin, int iChunk); + void record(SoloReadBarcode &soloBar, uint nTr, set &readGene, set &readGeneFull, Transcript *alignOut); + void addCounts(const SoloReadFeature &soloCBin); + void addStats(const SoloReadFeature &soloCBin); + void statsOut(ofstream &streamOut); + void inputRecords(uint32 **cbP, uint32 *cbReadCountExact); + +private: + const int32 featureType; + + const Parameters &P; + const ParametersSolo &pSolo; +}; + +#endif diff --git a/star-sys/STAR/source/SoloReadFeature_inputRecords.cpp b/star-sys/STAR/source/SoloReadFeature_inputRecords.cpp new file mode 100755 index 0000000..bb8cc49 --- /dev/null +++ 
b/star-sys/STAR/source/SoloReadFeature_inputRecords.cpp @@ -0,0 +1,100 @@ +#include +#include "SoloReadFeature.h" +#include "binarySearch2.h" +#include "serviceFuns.cpp" + + +bool inputFeatureUmi(fstream *strIn, int32 featureType, uint32 &feature, uint32 &umi, const array,2> &sjAll) +{ + if (!(*strIn >> umi)) //end of file + return false; + + if (featureType==0 || featureType==2) {//gene + *strIn >> feature; + } else if (featureType==1) {//sj + uint32 sj[2]; + *strIn >> sj[0] >> sj[1]; + feature=(uint32) binarySearch2(sj[0],sj[1],sjAll[0].data(),sjAll[1].data(),sjAll[0].size()); + }; + + return true; +}; + +void SoloReadFeature::inputRecords(uint32 **cbP, uint32 *cbReadCountExactTotal) +{ + {//load exact matches + strU_0->flush(); + strU_0->seekg(0,ios::beg); + uint32 feature, umi; + int64 cb; + while (inputFeatureUmi(strU_0, featureType, feature, umi, P.sjAll)) { + *strU_0 >> cb; + if (!pSolo.cbWLyes) //if no-WL, the full cbInteger was recorded - now has to be placed in order + cb=binarySearchExact(cb,pSolo.cbWL.data(),pSolo.cbWL.size()); + if (feature != (uint32)(-1)){ + cbP[cb][0]=feature; + cbP[cb][1]=umi; + cbP[cb]+=2; + stats.V[stats.nExactMatch]++; + }; + }; + }; + + if (!pSolo.cbWLyes) //no WL => no mismatch check + return; + + {//1 match + strU_1->flush(); + strU_1->seekg(0,ios::beg); + uint32 cb, feature, umi; + while (inputFeatureUmi(strU_1,featureType, feature, umi, P.sjAll)) { + *strU_1 >> cb; + if (cbReadCountExactTotal[cb]>0) { + if (feature != (uint32)(-1)){ + cbP[cb][0]=feature; + cbP[cb][1]=umi; + cbP[cb]+=2; + }; + } else { + stats.V[stats.nNoExactMatch]++; + }; + }; + }; + + {//2 matches + strU_2->flush(); + strU_2->seekg(0,ios::beg); + uint32 cb=0, feature, umi, ncb; + while (inputFeatureUmi(strU_2,featureType, feature, umi, P.sjAll)) { + if (feature == (uint32) (-1)) { + strU_2->ignore((uint32) (-1),'\n');//ignore until the end of the line + continue; //nothing to record + }; + *strU_2 >> ncb; + float ptot=0.0,pmax=0.0; + for (uint32 ii=0; ii> 
cbin >> qin; + if (cbReadCountExactTotal[cbin]>0) {//otherwise this cbin does not work + qin -= pSolo.QSbase; + qin = qin < pSolo.QSmax ? qin : pSolo.QSmax; + pin=cbReadCountExactTotal[cbin]*std::pow(10.0,-qin/10.0); + ptot+=pin; + if (pin>pmax) { + cb=cbin; + pmax=pin; + }; + }; + }; + if (ptot>0.0 && pmax>=pSolo.cbMinP*ptot) { + cbP[cb][0]=feature; + cbP[cb][1]=umi; + cbP[cb]+=2; + } else { + stats.V[stats.nTooMany]++; + }; + }; + }; +}; diff --git a/star-sys/STAR/source/SoloReadFeature_record.cpp b/star-sys/STAR/source/SoloReadFeature_record.cpp new file mode 100755 index 0000000..87c3960 --- /dev/null +++ b/star-sys/STAR/source/SoloReadFeature_record.cpp @@ -0,0 +1,88 @@ +#include "SoloReadFeature.h" +#include "serviceFuns.cpp" +#include "SequenceFuns.h" + +uint32 outputReadCB(fstream *streamOut, int32 featureType, uint32 umiB, uint32 gene, vector> &readSJs, const string &stringCB) +{ + if (featureType==0 || featureType==2) {//genes + *streamOut << umiB <<' '<< gene <<' '<< stringCB <<'\n'; + return 1; + } else if (featureType==1) {//sjs + for (auto &sj : readSJs) { + *streamOut << umiB <<' '<< sj[0] <<' '<< sj[1] <<' '<< stringCB <<'\n'; + }; + return readSJs.size(); + }; + + return 0; //this should not happen +}; + +void SoloReadFeature::record(SoloReadBarcode &soloBar, uint nTr, set &readGene, set &readGeneFull, Transcript *alignOut) +{ + if (pSolo.type==0 || soloBar.cbMatch<0) + return; + + //unmapped + if (nTr==0) { + stats.V[stats.nUnmapped]++; + return; + }; + + vector> readSJs; + + set *readGe; + if (featureType==0) { + readGe = &readGene; + } else if (featureType==2) { + readGe = &readGeneFull; + }; + + if (featureType==0 || featureType==2) {//genes + //check genes, return if no gene of multimapping + if (readGe->size()==0) { + stats.V[stats.nNoFeature]++; + return; + }; + if (readGe->size()>1) { + stats.V[stats.nAmbigFeature]++; + if (nTr>1) + stats.V[stats.nAmbigFeatureMultimap]++; + return; + }; + } else if (featureType==1) {//SJs + if (nTr>1) 
{//reject all multimapping junctions + stats.V[stats.nAmbigFeatureMultimap]++; + return; + }; + + //for SJs, still check genes, return if multi-gene + if (readGene.size()>1) { + stats.V[stats.nAmbigFeature]++; + return; + }; + bool sjAnnot; + alignOut->extractSpliceJunctions(readSJs, sjAnnot); + if ( readSJs.empty() || (sjAnnot && readGene.size()==0) ) {//no junctions, or annotated junction buy no gene (i.e. read does not fully match transcript) + stats.V[stats.nNoFeature]++; + return; + }; + }; + + if (soloBar.cbMatch==0) {//exact match + uint32 n1 = outputReadCB(strU_0, featureType, soloBar.umiB, *readGe->begin(), readSJs, to_string(soloBar.cbI)); + if (pSolo.cbWL.size()>0) {//WL + cbReadCount[soloBar.cbI] += n1; + } else {//no WL + cbReadCountMap[soloBar.cbI] += n1; + }; + return; + } else if (soloBar.cbMatch==1) {//1 match with 1MM + cbReadCount[soloBar.cbI]+= outputReadCB(strU_1, featureType, soloBar.umiB, *readGe->begin(), readSJs, to_string(soloBar.cbI)); + return; + } else {//>1 matches + uint32 nfeat=outputReadCB(strU_2, featureType, soloBar.umiB, *readGe->begin(), readSJs, to_string(soloBar.cbMatch) + soloBar.cbMatchString); + for (auto &cbi : soloBar.cbMatchInd) + cbReadCount[cbi] += nfeat; + return; + }; +}; diff --git a/star-sys/STAR/source/SoloRead_record.cpp b/star-sys/STAR/source/SoloRead_record.cpp new file mode 100755 index 0000000..ed4cff3 --- /dev/null +++ b/star-sys/STAR/source/SoloRead_record.cpp @@ -0,0 +1,10 @@ +#include "SoloRead.h" + +void SoloRead::record(uint64 nTr, set &readGene, set &readGeneFull, Transcript *alignOut) +{ + if (pSolo.type==0) + return; + + for (uint32 ii=0; iirecord(*readBar, nTr, readGene, readGeneFull, alignOut); +}; diff --git a/star-sys/STAR/source/Stats.cpp b/star-sys/STAR/source/Stats.cpp new file mode 100644 index 0000000..e9c136e --- /dev/null +++ b/star-sys/STAR/source/Stats.cpp @@ -0,0 +1,154 @@ +#include "Stats.h" +#include "TimeFunctions.h" + +void Stats::resetN() {//zero all counters + readN = 0; readBases 
= 0; + mappedMismatchesN = 0; mappedInsN = 0; mappedDelN = 0; mappedInsL = 0; mappedDelL = 0; mappedBases = 0; mappedPortion = 0; + mappedReadsU = 0; mappedReadsM = 0; + unmappedOther = 0; unmappedShort = 0; unmappedMismatch = 0; unmappedMulti = 0; unmappedAll = 0; + chimericAll = 0; + splicesNsjdb=0; + for (uint ii=0; ii=0) splicesN[T.canonSJ[ii]]++; + if (T.sjAnnot[ii]==1) splicesNsjdb++; + }; + + mappedBases += mappedL; + mappedPortion += double(mappedL)/double(Lread); +}; + +#define SETW1 setw(9) +#define SETW2 setw(8) +#define SETW3 setw(12) + +void Stats::progressReportHeader(ofstream &progressStream) { + progressStream <=60.0 && readN>0) {//make the report + //progressStream.imbue(std::locale("")); + progressStream <0 ? readBases/readN : 0) \ + <0 ? double(mappedReadsU)/double(readN)*100 : 0) <<'%' \ + <0 ? double(mappedBases)/double(mappedReadsU) : 0) + <0 ? double(mappedMismatchesN)/double(mappedBases)*100 : 0) <<'%' \ + <0 ? double(mappedReadsM)/double(readN)*100 : 0) <<'%'\ + <0 ? double(unmappedMulti)/double(readN)*100 : 0) <<'%'\ + <0 ? double(unmappedMismatch)/double(readN)*100 : 0) <<'%'\ + <0 ? double(unmappedShort)/double(readN)*100 : 0)<<'%'\ + <0 ? double(unmappedOther)/double(readN)*100 : 0) <<'%'\ + <<"\n"<0 ? readBases/readN : 0) <<"\n" \ + <0 ? double(mappedReadsU)/double(readN)*100 : 0) <<'%'<<"\n" \ + <0 ? double(mappedBases)/double(mappedReadsU) : 0) <<"\n"; + + streamOut <0 ? double(mappedDelL)/double(mappedBases)*100 : 0) <<'%' <<"\n" \ + <0 ? double(mappedDelL)/double(mappedDelN) : 0) <<"\n" \ + <0 ? double(mappedInsL)/double(mappedBases)*100 : 0) <<'%' <<"\n" \ + <0 ? double(mappedInsL)/double(mappedInsN) : 0) <<"\n" \ + <0 ? double(mappedReadsM)/double(readN)*100 : 0)<<'%' <<"\n" \ + <0 ? double(unmappedMulti)/double(readN)*100 : 0) <<'%' <<"\n" \ + <0 ? double(unmappedMismatch)/double(readN)*100 : 0) <<'%' <<"\n" \ + <0 ? double(unmappedShort)/double(readN)*100 : 0) <<'%' <<"\n" \ + <0 ? 
double(unmappedOther)/double(readN)*100 :0) <<'%'<<"\n" \ + <0 ? double(chimericAll)/double(readN)*100 :0) <<'%'<<"\n" < outType, const string commStr, const string outStr) { + for (const auto& tt : outType) { + if (tt==1) { + if (outStr!="") + streamOut << commStr <<" "<< outStr <<"\n"; + streamOut << commStr <<" "<< "Nreads " << readN <<"\t"<< "NreadsUnique " << mappedReadsU <<"\t"<< "NreadsMulti " << mappedReadsM << "\n"; + }; + }; +}; + diff --git a/star-sys/STAR/source/Stats.h b/star-sys/STAR/source/Stats.h new file mode 100644 index 0000000..2f40b80 --- /dev/null +++ b/star-sys/STAR/source/Stats.h @@ -0,0 +1,38 @@ +#ifndef STATS_DEF +#define STATS_DEF + +#include "IncludeDefine.h" +#include "Transcript.h" +#include "Parameters.h" + + +class Stats { + public: + uint readN;//number of reads from the file + uint readBases;//number of input bases +// uint mateLmax[2], mateLmin[2];//mates' max and min length + + uint mappedReadsU, mappedReadsM; + uint mappedBases, mappedMismatchesN, mappedInsN, mappedDelN, mappedInsL, mappedDelL; + double mappedPortion; //portion of the read length that has been mapped + + uint splicesN[SJ_MOTIF_SIZE];//non-can,3*can,annotated + uint splicesNsjdb; + + uint unmappedOther, unmappedShort, unmappedMismatch, unmappedMulti, unmappedAll; + + uint chimericAll; + + time_t timeStart, timeStartMap, timeFinishMap, timeLastReport, timeFinish; + + Stats (); + void resetN(); + void printShort(ostream*); + void transcriptStats(Transcript &T, uint Lread); + void addStats(Stats &S); + void progressReportHeader(ofstream &progressStream); + void progressReport(ofstream &progressStream) ; + void reportFinal(ofstream &streamOut); + void writeLines(ofstream &streamOut, const vector outType, const string commStr, const string outStr);// write commented lines to text files with stats +}; +#endif diff --git a/star-sys/STAR/source/SuffixArrayFuns.cpp b/star-sys/STAR/source/SuffixArrayFuns.cpp new file mode 100644 index 0000000..835abac --- /dev/null +++ 
b/star-sys/STAR/source/SuffixArrayFuns.cpp @@ -0,0 +1,410 @@ +#include "SuffixArrayFuns.h" +#include "PackedArray.h" + +inline uint medianUint2(uint a, uint b) +{ + // returns (a+b)/2 + return a/2 + b/2 + (a%2 + b%2)/2; +}; + +uint compareSeqToGenome(const Genome &mapGen, char** s2, uint S, uint N, uint L, uint iSA, bool dirR, bool& compRes) +{ + /* compare s to g, find the maximum identity length + * s2[0] read sequence; s2[1] complementary sequence + * S position to start search from in s2[0],s2[1] + * dirR forward or reverse direction search on read sequence + */ + + int64 ii; + + uint SAstr=mapGen.SA[iSA]; + bool dirG = (SAstr>>mapGen.GstrandBit) == 0; //forward or reverse strand of the genome + SAstr &= mapGen.GstrandMask; + + char *g=mapGen.G; + + if (dirR && dirG) {//forward on read, forward on genome + char* s = s2[0] + S + L; + g += SAstr + L; + for (ii=0;(uint) ii < N-L; ii++) + { + if (s[ii]!=g[ii]) + { + if (s[ii]>g[ii]) + { + compRes=true; + return ii+L; + } else + { + compRes=false; + return ii+L; + }; + }; + }; +// if (s[ii]>g[ii]) {compRes=true;} else {compRes=false;}; + return N; //exact match + } else if (dirR && !dirG) { + char* s = s2[1] + S + L; + g += mapGen.nGenome-1-SAstr - L; + for (ii=0; (uint) ii < N-L; ii++) + { + if (s[ii]!=g[-ii]) + { + if (s[ii]>g[-ii] || g[-ii]>3) + { + compRes=false; + return ii+L; + } else + { + compRes=true; + return ii+L; + }; + }; + }; + return N; + } else if (!dirR && dirG) { + char* s = s2[1] + S - L; + g += SAstr + L; + for (ii=0; (uint) ii < N-L; ii++) + { + if (s[-ii]!=g[ii]) + { + if (s[-ii]>g[ii]) { + compRes=true; + return ii+L; + + } else + { + compRes=false; + return ii+L; + }; + }; + }; + return N; + } else {//if (!dirR && !dirG) + char* s = s2[0] + S - L; + g += mapGen.nGenome-1-SAstr - L; + for (ii=0; (uint) ii < N-L; ii++) + { + if (s[-ii]!=g[-ii]) + { + if (s[-ii]>g[-ii] || g[-ii]>3) + { + compRes=false; + return ii+L; + } else + { + compRes=true; + return ii+L; + }; + }; + }; + return N; + }; +}; 
+ +uint findMultRange(const Genome &mapGen, uint i3, uint L3, uint i1, uint L1, uint i1a, uint L1a, uint i1b, uint L1b, char** s, bool dirR, uint S) +{ // given SA index i3 and identity length L3, return the index of the farthest element with the same length, starting from i1,L1 or i1a,L1a, or i1b,L1b + + bool compRes; + + if (L1i1a+1) ) { //L1a is the target length, i1a...i1b is the initial range, i1c,L1c is the value in the middle + uint i1c=medianUint2(i1a,i1b); + //uint L1c=identityLength(&g[mapGen.SA[i3]+L1b],&g[mapGen.SA[i1c]+L1b],L3-L1b)+L1b; + uint L1c=compareSeqToGenome(mapGen,s,S,L3,L1b,i1c,dirR,compRes); + if (L1c==L3) { + i1a=i1c; + } + else { //L1c=i2 an not iteration of the loope below is ever made + while (i1+1L1) { + L1b=L1a; L1a=L1; i1b=i1a; i1a=i1; + // L1b, i1b - captures history of last time the max score shifted. + // L1a, i1a - tracks current shift. + }; + i1=i3;L1=L3; + } + else { + if (L3>L2) { //move 2 to 3 + L2b=L2a; L2a=L2; i2b=i2a; i2a=i2; + }; + i2=i3;L2=L3; + }; + L= min(L1,L2); + + }; + + if (L3L2) { + i3=i1;L3=L1; + } else { + i3=i2;L3=L2; + }; + }; + // now i3,L3 is the "best" alignment, i.e. longest length + + // find the range of SA indices in which the identiyLength is the same + i1=findMultRange(mapGen,i3,L3,i1,L1,i1a,L1a,i1b,L1b,s,dirR,S); + i2=findMultRange(mapGen,i3,L3,i2,L2,i2a,L2a,i2b,L2b,s,dirR,S); + + L=L3; //output + indStartEnd[0]=i1; + indStartEnd[1]=i2; + + return i2-i1+1; +}; + + +int compareRefEnds (const Genome &mapGen, uint64 SAstr, uint64 gInsert, bool strG, bool strR) +{ + if ( strG) + {// + strand g + return strR ? (SAstr < gInsert ? 1:-1) : 1; + } else + {// - strand g + return strR ? -1 : ( gInsert==-1LLU ? -1 : ( SAstr < mapGen.nGenome-gInsert ? 
1:-1) ); + }; +}; + +uint compareSeqToGenome1(const Genome &mapGen, char** s2, uint S, uint N, uint L, uint iSA, bool dirR, uint64 gInsert, int & compRes) +{ + /* compare s to g, find the maximum identity length + * s2[0] read sequence; s2[1] complementary sequence + * S position to start search from in s2[0],s2[1] + * dirR: strand of the s + * different treatment of 5 (spacer) in the sequence and genome + * 5 is allowed in the sequence + * 5 in the genome is < than 5 in the sequence + */ + + //TODO no need for complementary sequence + + int64 ii; + + uint SAstr=mapGen.SA[iSA]; + bool dirG = (SAstr>>mapGen.GstrandBit) == 0; //forward or reverse strand of the genome + SAstr &= mapGen.GstrandMask; + char *g=mapGen.G; + + if (dirG) {//forward on read, forward on genome + char* s = s2[0] + S + L; + g += SAstr + L; + for (ii=0;(uint) ii < N-L; ii++) + { + if (s[ii]!=g[ii]) + { + if (s[ii]>g[ii]) + { + compRes=1; + return ii+L; + } else + { + compRes=-1; + return ii+L; + }; + } else if (s[ii]==GENOME_spacingChar) + {//this already implies the s[ii]==g[ii] + compRes=compareRefEnds (mapGen, SAstr, gInsert, dirG, dirR); + return ii+L; + }; + }; +// if (s[ii]>g[ii]) {compRes=true;} else {compRes=false;}; + return N; //exact match + } + else + { + char* s = s2[1] + S + L; + g += mapGen.nGenome-1-SAstr - L; + for (ii=0; (uint) ii < N-L; ii++) + { + if (s[ii]!=g[-ii]) + { + char s1=s[ii],g1=g[-ii]; + if (s1<4) s1=3-s1; + if (g1<4) g1=3-g1; + if (s1>g1) { + compRes=1; + return ii+L; + } else + { + compRes=-1; + return ii+L; + }; + break; + } else if (s[ii]==GENOME_spacingChar) + {//this already implies the s[ii]==g[ii] + compRes=compareRefEnds (mapGen, SAstr, gInsert, dirG, dirR); + return ii+L; + }; + }; + return N; + }; +}; + + +uint suffixArraySearch1(const Genome &mapGen, char** s, uint S, uint N, uint64 gInsert, bool strR, uint i1, uint i2, uint L) +{ + /* binary search in SA space + * s[0],s[1] - query sequence, complementary sequence + * S - start offset + * N - sequence 
length + * g - genome sequence + * gInsert - index where the sequence insertion happened + * SA - suffix array + * strR - strand of the query sequence + * i1,i2 = starting indices in SA + * L - starting length + * output: first SA index > searched string, i.e. g[SA[index-1]]0) + {//the sequence is bigger than the last SA index, return a huge number + L=L2; + return -2llu; + }; + + L=min(L1,L2); + + uint i3=i1,L3=L1; //in case i1+1>=i2 an not iteration of the loope below is ever made + while (i1+10) + { //move 1 to 3 + i1=i3;L1=L3; + } else if (compRes<0) + {//move 2 to 3 + i2=i3;L2=L3; + } + L= min(L1,L2); + }; + return i2; //index at i2 is always bigger than the sequence +}; + +uint funCalcSAiFromSA(char* gSeq, PackedArray& gSA, const Genome &mapGen, uint iSA, int L, int & iL4) +{ + uint SAstr=gSA[iSA]; + bool dirG = (SAstr>>mapGen.GstrandBit) == 0; //forward or reverse strand of the genome + SAstr &= mapGen.GstrandMask; + iL4=-1; + uint saind=0; + if (dirG) + { + uint128 g1=*( (uint128*) (gSeq+SAstr) ); + for (int ii=0; ii3) + { + iL4=ii; + saind <<= 2*(L-ii); + return saind; + }; + saind=saind<<2; + saind+=g2; + g1=g1>>8; + }; + return saind; + } else + { + uint128 g1=*( (uint128*) (gSeq+mapGen.nGenome-SAstr-16) ); + for (int ii=0; ii>(8*(15-ii))); + if (g2>3) + { + iL4=ii; + saind <<= 2*(L-ii); + return saind; + }; + saind=saind<<2; + saind+=3-g2; + }; + return saind; + }; + +}; + +int64 funCalcSAi(char *G, uint iL) +{ + int64 ind1=0; + for (uint iL1=0;iL1<=iL;iL1++) { + uint g=(uint) G[iL1]; + if (g>3) { + return -ind1; + } else { + ind1 <<= 2; + ind1 += g; + }; + }; + return ind1; +}; diff --git a/star-sys/STAR/source/SuffixArrayFuns.h b/star-sys/STAR/source/SuffixArrayFuns.h new file mode 100644 index 0000000..aa6a7c1 --- /dev/null +++ b/star-sys/STAR/source/SuffixArrayFuns.h @@ -0,0 +1,18 @@ +#ifndef CODE_SuffixArrayFuns +#define CODE_SuffixArrayFuns + +#include "IncludeDefine.h" +#include "Parameters.h" +#include "PackedArray.h" +#include "Genome.h" + 
+uint medianUint2(uint, uint); +uint compareSeqToGenome(const Genome &mapGen, char** s2, uint S, uint N, uint L, uint iSA, bool dirR, bool& comparRes); +uint findMultRange(const Genome &mapGen, uint i3, uint L3, uint i1, uint L1, uint i1a, uint L1a, uint i1b, uint L1b, char** s, bool dirR, uint S); +uint maxMappableLength(const Genome &mapGen, char** s, uint S, uint N, uint i1, uint i2, bool dirR, uint& L, uint* indStartEnd); +void writePacked(const Genome &mapGen, char* a, uint jj, uint x); +uint readPacked(const Genome &mapGen, char* a, uint jj); +uint suffixArraySearch1(const Genome &mapGen, char** s2, uint S, uint N, uint64 gInsert, bool dirR, uint i1, uint i2, uint L); +int64 funCalcSAi(char *G, uint iL); +uint funCalcSAiFromSA(char* gSeq, PackedArray& gSA, const Genome &mapGen, uint iSA, int L, int & iL4); +#endif diff --git a/star-sys/STAR/source/ThreadControl.cpp b/star-sys/STAR/source/ThreadControl.cpp new file mode 100644 index 0000000..4c757ca --- /dev/null +++ b/star-sys/STAR/source/ThreadControl.cpp @@ -0,0 +1,7 @@ +#include "ThreadControl.h" + +ThreadControl::ThreadControl() { + chunkInN=0; + chunkOutN=0; +// chunkOutBAMposition=new uint [MAX_chunkOutBAMposition]; +}; \ No newline at end of file diff --git a/star-sys/STAR/source/ThreadControl.h b/star-sys/STAR/source/ThreadControl.h new file mode 100644 index 0000000..fd7c6bf --- /dev/null +++ b/star-sys/STAR/source/ThreadControl.h @@ -0,0 +1,24 @@ +#ifndef THREAD_CONTROL_DEF +#define THREAD_CONTROL_DEF + +#include "ReadAlignChunk.h" +#include + +#define MAX_chunkOutBAMposition 100000 + +class ThreadControl { +public: + bool threadBool; + + pthread_t *threadArray; + pthread_mutex_t mutexInRead, mutexOutSAM, mutexOutBAM1, mutexOutChimSAM, mutexOutChimJunction, mutexOutUnmappedFastx, mutexOutFilterBySJout; + pthread_mutex_t mutexStats, mutexLogMain, mutexBAMsortBins, mutexError; + + uint chunkInN,chunkOutN; + + ThreadControl(); + +}; + +#endif + diff --git a/star-sys/STAR/source/TimeFunctions.cpp 
b/star-sys/STAR/source/TimeFunctions.cpp new file mode 100644 index 0000000..c89e3a1 --- /dev/null +++ b/star-sys/STAR/source/TimeFunctions.cpp @@ -0,0 +1,20 @@ +#include +#include + +std::string timeMonthDayTime() { + time_t rawTime; + char timeChar[100]; + time(&rawTime); + strftime(timeChar,80,"%b %d %H:%M:%SS",localtime(&rawTime)); + std::string timeString=timeChar; + timeString.erase(timeString.end()-1,timeString.end()); + return timeString; +}; + +std::string timeMonthDayTime(time_t &rawTime) { + char timeChar[100]; + strftime(timeChar,80,"%b %d %H:%M:%SS",localtime(&rawTime)); + std::string timeString=timeChar; + timeString.erase(timeString.end()-1,timeString.end()); + return timeString; +}; diff --git a/star-sys/STAR/source/TimeFunctions.h b/star-sys/STAR/source/TimeFunctions.h new file mode 100644 index 0000000..5512567 --- /dev/null +++ b/star-sys/STAR/source/TimeFunctions.h @@ -0,0 +1,9 @@ +#ifndef TIME_FUNCTIONS_DEF +#define TIME_FUNCTIONS_DEF +#include +#include + +string timeMonthDayTime(); +string timeMonthDayTime(time_t &rawTime); + +#endif diff --git a/star-sys/STAR/source/Transcript.cpp b/star-sys/STAR/source/Transcript.cpp new file mode 100644 index 0000000..d7f63b5 --- /dev/null +++ b/star-sys/STAR/source/Transcript.cpp @@ -0,0 +1,52 @@ +#include "Transcript.h" + +Transcript::Transcript() +{ + reset(); +}; + +void Transcript::reset() { + extendL=0; + +// for (uint ii=0;ii<4;ii++) { +// polyXlength[ii]=0; +// polyXnMM[ii]=0; +// }; + primaryFlag=false; + + rStart=0; roStart=0; rLength=0; gStart=0; gLength=0; //read and genomic coordinates + + maxScore=0; + nMatch=0; + nMM=0; + + nGap=0; lGap=0; lDel=0; lIns=0; nDel=0; nIns=0; + + nUnique=nAnchor=0; +}; + +void Transcript::add(Transcript *trIn) { + maxScore+=trIn->maxScore; + nMatch+=trIn->nMatch; + nMM+=trIn->nMM; + nGap+=trIn->nGap; lGap+=trIn->lGap; + lDel+=trIn->lDel; nDel+=trIn->nDel; + lIns+=trIn->lIns; nIns+=trIn->nIns; + nUnique+=trIn->nUnique; +}; + +void 
Transcript::extractSpliceJunctions(vector> &sjOut, bool &annotYes) +{ + annotYes=true; + for (uint64 iex=0; iex=0) {//only record junctions, not indels or mate gap + array sj; + sj[0]=exons[iex][EX_G]+exons[iex][EX_L];//start + sj[1]=exons[iex+1][EX_G] - sj[0]; //gap + sjOut.push_back(sj); + if (sjAnnot[iex]==0) + annotYes=false;//if one of the SJs is unannoated, annotYes=false + }; + }; +}; + diff --git a/star-sys/STAR/source/Transcript.h b/star-sys/STAR/source/Transcript.h new file mode 100644 index 0000000..d9c46fe --- /dev/null +++ b/star-sys/STAR/source/Transcript.h @@ -0,0 +1,71 @@ +#ifndef CODE_Transcript +#define CODE_Transcript + +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Variation.h" +#include "Genome.h" + +class Transcript { +public: + uint exons[MAX_N_EXONS][EX_SIZE]; //coordinates of all exons: r-start, g-start, length + uint shiftSJ[MAX_N_EXONS][2]; //shift of the SJ coordinates due to genomic micro-repeats + int canonSJ[MAX_N_EXONS]; //canonicity of each junction + uint8 sjAnnot[MAX_N_EXONS]; //anotated or not + uint8 sjStr[MAX_N_EXONS]; //strand of the junction + + uint intronMotifs[3]; + uint8 sjMotifStrand; + + uint nExons; //number of exons in the read transcript + + //variables from ReadAlign + uint *readLengthOriginal, *readLength; + uint Lread, readLengthPairOriginal; + uint iRead; //read identifier + uint readNmates; + char *readName; + + int iFrag; //frag number of the transcript, if the the transcript contains only one frag + + //loci + uint rStart, roStart, rLength, gStart, gLength, cStart; //read, original read, and genomic start/length, chromosome start + uint Chr,Str,roStr; //chromosome and strand and original read Strand + + bool primaryFlag; + + uint nMatch;//min number of matches + uint nMM;//max number of mismatches + uint mappedLength; //total mapped length, sum of lengths of all blocks(exons) + + uint extendL; //extension length + intScore maxScore; //maximum Score + + uint nGap, lGap; //number of genomic gaps 
(>alignIntronMin) and their total length + uint nDel; //number of genomic deletions (ie genomic gaps) + uint nIns; //number of (ie read gaps) + uint lDel; //total genomic deletion length + uint lIns; //total genomic insertion length + + uint nUnique, nAnchor; //number of unique pieces in the alignment, number of anchor pieces in the alignment + + vector varInd; + vector varGenCoord, varReadCoord ; + vector varAllele; + + Transcript(); //resets to 0 + void reset(); //reset to 0 + void resetMapG(); // reset map to 0 + void resetMapG(uint); // reset map to 0 for Lread bases + void add(Transcript*); // add + intScore alignScore(char **Read1, char *G, const Parameters &P); + int variationAdjust(const Genome &mapGen, char *R); + string generateCigarP(); //generates CIGAR + void peOverlapSEtoPE(uint* mSta, Transcript &t); + void extractSpliceJunctions(vector> &sjOut, bool &annotYes); + +private: + +}; + +#endif diff --git a/star-sys/STAR/source/Transcript_alignScore.cpp b/star-sys/STAR/source/Transcript_alignScore.cpp new file mode 100644 index 0000000..b6bcfd7 --- /dev/null +++ b/star-sys/STAR/source/Transcript_alignScore.cpp @@ -0,0 +1,56 @@ +#include +#include "Transcript.h" + +intScore Transcript::alignScore(char **Read1, char *G, const Parameters &P) {//re-calculates score and number of mismatches + maxScore=0; + nMM=0; + nMatch=0; + char* R=Read1[roStr==0 ? 
0:2]; + for (uint iex=0;iex3 || g1>3) {//nothing to do + } else if (r1==g1) {//match + ++maxScore; + ++nMatch; + } else {//mismatch + ++nMM; + --maxScore; + }; + }; + }; + for (uint iex=0;iex1) leftMate=Str; + + uint trimL=exons[0][EX_R] - (exons[0][EX_R]0) { + samStreamCIGAR << trimL << "S"; //initial trimming + }; + + for (uint ii=0;ii0) {//record gaps + uint gapG=exons[ii][EX_G]-(exons[ii-1][EX_G]+exons[ii-1][EX_L]); + + if (exons[ii][EX_G] >= (exons[ii-1][EX_G]+exons[ii-1][EX_L]) ) {// + if (canonSJ[ii-1]==-3) {//gap between mates + //soft clipping of the second mate + uint s1=readLengthOriginal[leftMate]-(exons[ii-1][EX_R]+exons[ii-1][EX_L]); + uint s2=exons[ii][EX_R]-(readLengthOriginal[leftMate]+1); + if (s1>0){ + samStreamCIGAR << s1 << "S"; + }; + samStreamCIGAR << gapG << "p"; + if (s2>0){ + samStreamCIGAR << s2 << "S"; + }; + + } else { + //it's possible to have a D or N and I for at the same time + uint gapR=exons[ii][EX_R]-exons[ii-1][EX_R]-exons[ii-1][EX_L]; //gapR>0 always + if (gapR>0){ + samStreamCIGAR << gapR << "I"; + }; + if (canonSJ[ii-1]>=0 || sjAnnot[ii-1]==1) {//junction: N + samStreamCIGAR << gapG << "N"; + } else if (gapG>0) {//deletion + samStreamCIGAR << gapG << "D"; + }; + }; + } else {//mates overlap + samStreamCIGAR << "-" << (exons[ii-1][EX_G]+exons[ii-1][EX_L]) - exons[ii][EX_G] << "p"; + }; + }; + samStreamCIGAR << exons[ii][EX_L] << "M"; + }; + + + trimL=(exons[nExons-1][EX_R] 0 ) { + samStreamCIGAR << trimL << "S"; //final trimming + }; + CIGAR=samStreamCIGAR.str(); + + return CIGAR; +}; diff --git a/star-sys/STAR/source/Transcript_variationAdjust.cpp b/star-sys/STAR/source/Transcript_variationAdjust.cpp new file mode 100644 index 0000000..0e56959 --- /dev/null +++ b/star-sys/STAR/source/Transcript_variationAdjust.cpp @@ -0,0 +1,74 @@ +#include "Transcript.h" +#include "serviceFuns.cpp" + +int Transcript::variationAdjust(const Genome &mapGen, char *R) +{ + Variation &Var=*mapGen.Var; + + if (!Var.yes) + {//no variation + return 
0; + }; + + int dScore=0;//change in the score + uint nMM1=0; + + //for each block, check whether it overlaps one or more SNPs + for (uint ie=0; ie (exons[ie][EX_G], Var.snp.loci, Var.snp.N); + if (isnp>=0) + { + while ((uint)isnpVar.snp.loci[isnp]) + {//these SNPs overlap the block + varInd.push_back(isnp); //record snp index + varGenCoord.push_back(Var.snp.loci[isnp]-mapGen.chrStart[Chr]); + + varReadCoord.push_back(exons[ie][EX_R]+Var.snp.loci[isnp]-exons[ie][EX_G]); + char ntR=R[varReadCoord.back()];//nt of the read in the SNP position, already trnasformed to + genome strand + + uint8 igt; + if (ntR>3) { + igt=4; + } else { + for (igt=1; igt<3; igt++) {//1st or 2nd allele, =3 of none + if (Var.snp.nt[isnp][igt]==ntR) { + break; + }; + }; + }; + + //if (ntR == Var.snp.nt[isnp][0]) + //{//mark snp that agrees with the reference + // igt*=10; + //}; + + varAllele.push_back(igt); + + if (igt<3 && ntR != Var.snp.nt[isnp][0]) + {//non-reference allele, correct nMM and score + ++nMM1; + }; + + ++isnp; + }; + }; + }; + + #define VAR_noScoreCorrection + #ifndef VAR_noScoreCorrection + if (nMM1>0) + {//one or more mismtaches need to be corrected + uint nMMold=nMM; + alignScore(Read1, G, P); + nMM-=nMM1; + nMatch+=nMM1; + dScore=2*(nMMold-nMM);//score only changes if the number of mismatches is reduced after SNP adjustment + }; + #else + //#warning VAR_noScoreCorrection set: no variation score correction + #endif + + return dScore; +}; diff --git a/star-sys/STAR/source/Transcript_variationOutput.cpp b/star-sys/STAR/source/Transcript_variationOutput.cpp new file mode 100644 index 0000000..203221d --- /dev/null +++ b/star-sys/STAR/source/Transcript_variationOutput.cpp @@ -0,0 +1,6 @@ +#include "Transcript.h" + +void Transcript::variationOutput(Variation &Var) +{ + // +}; diff --git a/star-sys/STAR/source/Transcriptome.cpp b/star-sys/STAR/source/Transcriptome.cpp new file mode 100755 index 0000000..3f0c00f --- /dev/null +++ b/star-sys/STAR/source/Transcriptome.cpp @@ -0,0 
+1,185 @@ +#include "Transcriptome.h" +#include "streamFuns.h" +#include "GlobalVariables.h" +#include "ErrorWarning.h" +#include "serviceFuns.cpp" + +Transcriptome::Transcriptome (Parameters &Pin) : P(Pin){ + + if (!P.quant.yes) + return; + + trInfoDir = P.pGe.sjdbGTFfile=="-" ? P.pGe.gDir : P.sjdbInsert.outDir; //if GTF file is given at the mapping stage, it's always used for transcript info + + ifstream &geStream = ifstrOpen(trInfoDir+"/geneInfo.tab", ERROR_OUT, "SOLUTION: utilize --sjdbGTFfile /path/to/annotations.gtf option at the genome generation step or mapping step", P); + geStream >> nGe; + geID.resize(nGe); + geStream.ignore(999,'\n'); + string line1; + for (uint ii=0;ii> geID[ii]; + }; + geStream.close(); + + if ( P.quant.trSAM.yes ) {//load exon-transcript structures + //load tr and ex info + ifstream & trinfo = ifstrOpen(trInfoDir+"/transcriptInfo.tab", ERROR_OUT, "SOLUTION: utilize --sjdbGTFfile /path/to/annotantions.gtf option at the genome generation step or mapping step",P); + { + line1.clear(); + getline(trinfo,line1); + istringstream stream1(line1); + stream1 >> nTr; + } + trS=new uint [nTr]; + trE=new uint [nTr]; + trEmax=new uint [nTr]; + trExI=new uint32 [nTr]; + trExN=new uint16 [nTr]; + trStr=new uint8 [nTr]; + trID.resize(nTr); + for (uint32 itr=0; itr> trID[itr] >> trS[itr] >> trE[itr] >> trEmax[itr] >> str1 >> trExN[itr] >> trExI[itr]; + trStr[itr]=str1; + + if (!trinfo.good()) { + ostringstream errOut; + errOut <<"EXITING because of FATAL GENOME INDEX FILE error: transcriptInfo.tab is corrupt, or is incompatible with the current STAR version\n"; + errOut <<"SOLUTION: re-generate genome index"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P); + }; + + }; + P.inOut->logMain << "Loaded transcript database, nTr="<> nEx; + exSE = new uint32 [2*nEx]; + exLenCum = new uint32 [nEx]; + for (uint32 iex=0; iex> exSE[2*iex] >> exSE[2*iex+1] >> exLenCum[iex]; //reading all elements one after another + }; + 
P.inOut->logMain << "Loaded exon database, nEx="<> exG.nEx; + exG.s=new uint64[exG.nEx]; + exG.e=new uint64[exG.nEx]; + exG.eMax=new uint64[exG.nEx]; + exG.str=new uint8[exG.nEx]; + exG.g=new uint32[exG.nEx]; + exG.t=new uint32[exG.nEx]; + for (uint ii=0;ii> exG.s[ii] >> exG.e[ii] >> str1 >> exG.g[ii] >> exG.t[ii]; + exG.str[ii] = (uint8) str1; + }; + exinfo.close(); + //calculate eMax + exG.eMax[0]=exG.e[0]; + for (uint iex=1;iex> exG.nEx; + + geneFull.s=new uint64[nGe]; + geneFull.e=new uint64[nGe]; + geneFull.eMax=new uint64[nGe]; + geneFull.g=new uint32[nGe]; + geneFull.str=new uint8[nGe]; + + for (uint ig=0;ig> s1 >> e1 >> str1 >> g1 >> t1; + geneFull.s[g1]=min(geneFull.s[g1],s1); + geneFull.e[g1]=max(geneFull.e[g1],e1); + geneFull.str[g1] = (uint8) str1; + }; + exinfo.close(); + + uint64 *gF=new uint64 [4*nGe]; + for (uint ii=0;ii); + + for (uint ii=0;iigeneCounts.nType; itype++) { + qOut << "\t" <geneCounts.nType; itype++){ + qOut << "\t" <geneCounts.cMulti; + }; + qOut << "\n"; + + qOut << "N_noFeature"; + for (int itype=0; itypegeneCounts.nType; itype++){ + qOut << "\t" <geneCounts.cNone[itype]; + }; + qOut << "\n"; + + qOut << "N_ambiguous"; + for (int itype=0; itypegeneCounts.nType; itype++) { + qOut << "\t" <geneCounts.cAmbig[itype]; + }; + qOut << "\n"; + + for (uint32 ig=0; iggeneCounts.nType; itype++) { + qOut << "\t" <geneCounts.gCount[itype][ig]; + }; + qOut << "\n"; + }; + qOut.close(); +}; diff --git a/star-sys/STAR/source/Transcriptome.h b/star-sys/STAR/source/Transcriptome.h new file mode 100755 index 0000000..c96ab90 --- /dev/null +++ b/star-sys/STAR/source/Transcriptome.h @@ -0,0 +1,55 @@ +#ifndef CODE_Transcriptome +#define CODE_Transcriptome + +#include + +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "Quantifications.h" + +class Transcriptome { +public: + string trInfoDir; + + vector trID, geID; //transcript/gene IDs + uint32 nTr, nGe; //number of transcript/genes + + uint *trS, *trE, *trEmax; 
//transcripts start,end,end-max + + uint32 nEx; //number of exons + uint16 *trExN; //number of exons per transcript + uint32 *trExI; //index of the first exon for each transcript in exSE + uint8 *trStr; //transcript strand + uint32 *exSE; //exons start/end + uint32 *exLenCum; //cumulative length of previous exons + + struct {//exon-gene structure for GeneCounts + uint64 nEx;//number of exons/genes + uint64 *s,*e, *eMax; //exon start/end + uint8 *str; //strand + uint32 *g, *t; //gene/transcript IDs + } exG; + + struct {//geneFull structure + uint64 *s, *e, *eMax; + uint8 *str; + uint32 *g; + } geneFull; + + Quantifications *quants; + + //methods: + Transcriptome (Parameters &Pin); //create transcriptome structure, load and initialize parameters + uint32 quantAlign (Transcript &aG, Transcript *aTall, vector &readTranscripts, set &readGene);//transform coordinates for all aligns from genomic in RA to transcriptomic in RAtr + void geneCountsAddAlign(uint nA, Transcript **aAll, vector &gene1); //add one alignment to gene counts + void quantsAllocate(); //allocate quants structure + void quantsOutput(); //output quantification files + void geneFullAlignOverlap(uint nA, Transcript **aAll, int32 strandType, set &geneOverlap); + +private: + Parameters &P; //normal "genomic" parameters + +}; + +#endif diff --git a/star-sys/STAR/source/Transcriptome_geneCountsAddAlign.cpp b/star-sys/STAR/source/Transcriptome_geneCountsAddAlign.cpp new file mode 100644 index 0000000..1850257 --- /dev/null +++ b/star-sys/STAR/source/Transcriptome_geneCountsAddAlign.cpp @@ -0,0 +1,63 @@ +#include "Transcriptome.h" +#include "serviceFuns.cpp" + +void Transcriptome::geneCountsAddAlign(uint nA, Transcript **aAll, vector &gene1) { + + gene1.assign(quants->geneCounts.nType,-1); + + if (nA>1) { + quants->geneCounts.cMulti++; + } else { + Transcript& a=*aAll[0];//one unique alignment only + + int64 e1=-1; + + for (int ib=a.nExons-1; ib>=0; ib--) {//scan through all blocks of the alignments + + uint64 
g1=a.exons[ib][EX_G]+a.exons[ib][EX_L]-1;//end of the block + +// if ((uint)ib==a.nExons-1) +// {//binary search for the first time: end of the block among the starts of exons + e1=binarySearch1a(g1, exG.s, (int32) exG.nEx); +// } else +// {//simple backwards scan +// while (e1>=0 && exG.s[e1]>g1) +// {//stop when exon start is less than block end +// --e1; +// }; +// }; + + while (e1>=0 && exG.eMax[e1]>=a.exons[ib][EX_G]) {//these exons may overlap this block + if (exG.e[e1]>=a.exons[ib][EX_G]) {//this exon overlaps the block + uint str1=(uint)exG.str[e1]-1; + for (int itype=0; itypegeneCounts.nType; itype++) { + //str1<2 (i.e. strand=0) requirement means that genes w/o strand will accept reads from both strands + if ( itype==1 && a.Str!=str1 && str1<2) continue; //same strand + if ( itype==2 && a.Str==str1 && str1<2) continue; //reverse strand + + if (gene1.at(itype)==-1) {//first gene overlapping this read + gene1[itype]=exG.g[e1]; + } else if (gene1.at(itype)==-2) { + continue;//this align was already found to be ambig for this strand + } else if (gene1.at(itype)!=(int32)exG.g[e1]) {//another gene overlaps this read + gene1[itype]=-2;//mark ambiguous + };//otherwise it's the same gene + }; + }; + --e1;// go to the previous exon + }; + }; + + for (int itype=0; itypegeneCounts.nType; itype++) { + if (gene1.at(itype)==-1) { + quants->geneCounts.cNone[itype]++; + } else if (gene1.at(itype)==-2) { + quants->geneCounts.cAmbig[itype]++; + } else { + quants->geneCounts.gCount[itype][gene1.at(itype)]++; + }; + }; + }; +}; + + diff --git a/star-sys/STAR/source/Transcriptome_geneFullAlignOverlap.cpp b/star-sys/STAR/source/Transcriptome_geneFullAlignOverlap.cpp new file mode 100644 index 0000000..5dd69d3 --- /dev/null +++ b/star-sys/STAR/source/Transcriptome_geneFullAlignOverlap.cpp @@ -0,0 +1,28 @@ +#include "Transcriptome.h" +#include "serviceFuns.cpp" + +void Transcriptome::geneFullAlignOverlap(uint nA, Transcript **aAll, int32 strandType, set &geneOverlap) +{ + for 
(uint32 iA=0; iA=0; ib--) {//scan through all blocks of the alignments + + uint64 be1=a.exons[ib][EX_G]+a.exons[ib][EX_L]-1;//end of the block + gi1=binarySearch1a(be1, geneFull.s, (int32) nGe); + + while (gi1>=0 && geneFull.eMax[gi1]>=a.exons[ib][EX_G]) {//these exons may overlap this block + if (geneFull.e[gi1]>=a.exons[ib][EX_G]) {//this gene overlaps the block + int32 str1 = geneFull.str[gi1]==1 ? a.Str : 1-a.Str; + if (strandType==-1 || strandType==str1) + geneOverlap.insert(geneFull.g[gi1]); + }; + --gi1;// go to the previous gene + }; + }; + }; +}; + + diff --git a/star-sys/STAR/source/Transcriptome_quantAlign.cpp b/star-sys/STAR/source/Transcriptome_quantAlign.cpp new file mode 100644 index 0000000..d318637 --- /dev/null +++ b/star-sys/STAR/source/Transcriptome_quantAlign.cpp @@ -0,0 +1,115 @@ +#include "Transcriptome.h" +#include "ReadAlign.h" +#include "serviceFuns.cpp" + + +int alignToTranscript(Transcript &aG, uint trS1, uint8 trStr1, uint32 *exSE1, uint32 *exLenCum1, uint16 exN1, Transcript &aT) { + + //find exon that overlaps beginning of the read + uint32 g1=aG.exons[0][EX_G]-trS1;//start of the transcript + uint32 ex1=binarySearch1(g1, exSE1, 2*exN1); + if (ex1>=2*exN1) return 0; //align start is to the right of all exons + + if (ex1%2==1) {//beginning of the read >=end of an exon + if (exSE1[ex1]==g1) {//first base of the read is exactly the last base of the exon + --ex1; + } else { + return 0;//beginning of the read is past the end of an exon, align does not belong to this transcript + }; + }; + ex1=ex1/2; //this is the first exon of the alignment + + aT.nExons=0; + aT.primaryFlag=false; + + aG.canonSJ[aG.nExons-1]=-999; //marks the last exons + for (uint32 iab=0; iabexSE1[2*ex1+1]+trS1+1) {//block extends past exon end + return 0; + }; + + if (iab==0 || aG.canonSJ[iab-1]<0) { + aT.exons[aT.nExons][EX_R]=aG.exons[iab][EX_R]; + aT.exons[aT.nExons][EX_G]=aG.exons[iab][EX_G]-trS1-exSE1[2*ex1]+exLenCum1[ex1]; + 
aT.exons[aT.nExons][EX_L]=aG.exons[iab][EX_L]; + aT.exons[aT.nExons][EX_iFrag]=aG.exons[iab][EX_iFrag]; + if (aT.nExons>0) aT.canonSJ[aT.nExons-1]=aG.canonSJ[iab-1]; + ++aT.nExons; + } else { + aT.exons[aT.nExons-1][EX_L]+=aG.exons[iab][EX_L]; + }; + switch (aG.canonSJ[iab]) { + case -999: //last exon + if (trStr1==2) {//convert align coordinates if on the -strand + uint32 trlength=exLenCum1[exN1-1]+exSE1[2*exN1-1]-exSE1[2*exN1-2]+1; //transcript length + for (uint32 iex=0; iex(aG.exons[iab+1][EX_G]-trS1, exSE1, 2*exN1); + if (ex1%2==1) {//beginning of the mext mate in the middle of the exon? + return 0; //align does not belong to this transcript + } else { + ex1=ex1/2; //this is the first exon of the second mate + }; + break; + case -2: //insertion + break; + case -1: //deletion + break; + default://junctions + if ( aG.exons[iab][EX_G]+aG.exons[iab][EX_L]==exSE1[2*ex1+1]+trS1+1 && aG.exons[iab+1][EX_G]==exSE1[2*(ex1+1)]+trS1 ) { + //junction matches transcript junction + ++ex1; + } else { + return 0; + }; + }; + }; + return 0; //this should not happen +}; + +uint32 Transcriptome::quantAlign (Transcript &aG, Transcript *aTall, vector &/*readTranscripts*/, set &/*readTrGenes*/) { + uint32 nAtr=0; //number of alignments to the transcriptome + + //binary search through transcript starts + uint32 tr1=binarySearch1a(aG.exons[0][EX_G], trS, nTr); + if (tr1==(uint32) -1) return 0; //alignment outside of range of all transcripts + + uint aGend=aG.exons[aG.nExons-1][EX_G]; + + ++tr1; + do {//cycle back through all the transcripts + --tr1; + if (aGend<=trE[tr1]) {//this transcript contains the read + int aStatus=alignToTranscript(aG, trS[tr1], trStr[tr1], exSE+2*trExI[tr1], exLenCum+trExI[tr1], trExN[tr1], aTall[nAtr]); + if (aStatus==1) {//align conforms with the transcript + aTall[nAtr].Chr = tr1; + aTall[nAtr].Str = trStr[tr1]==1 ? 
aG.Str : 1-aG.Str; //TODO strandedness + ++nAtr; + }; + }; + } while (trEmax[tr1]>=aGend && tr1>0); + + return nAtr; +}; diff --git a/star-sys/STAR/source/VERSION b/star-sys/STAR/source/VERSION new file mode 100644 index 0000000..6fff7bf --- /dev/null +++ b/star-sys/STAR/source/VERSION @@ -0,0 +1 @@ +#define STAR_VERSION "2.7.2a" diff --git a/star-sys/STAR/source/Variation.cpp b/star-sys/STAR/source/Variation.cpp new file mode 100644 index 0000000..51b7f48 --- /dev/null +++ b/star-sys/STAR/source/Variation.cpp @@ -0,0 +1,151 @@ +#include "Variation.h" +#include "streamFuns.h" +#include "SequenceFuns.h" +#include "TimeFunctions.h" +#include "serviceFuns.cpp" +#include "ErrorWarning.h" + +Variation::Variation (Parameters &Pin, vector &chrStartIn, map &chrNameIndexIn) : P(Pin), chrStart(chrStartIn), chrNameIndex(chrNameIndexIn) { + if (!P.var.yes) { + yes=false; + return; + }; + + yes=true; + + //not used yet + //varOutFileName=P.outFileNamePrefix+"Variation.out"; + //varOutStream.open(varOutFileName); + + vcfFile=P.var.vcfFile; + loadVCF(vcfFile); + +}; + +void scanVCF(ifstream& vcf, Parameters& P, SNP& snp, vector &chrStart, map &chrNameIndex) { + snp.N=0; + uint nlines=0; + while (true) { + string chr,id, ref, alt, dummy, sample; + uint pos; + nlines++; + + vcf >> chr; + if (!vcf.good()) { + break; + }; + + if (chr.at(0)!='#') { + vcf >> pos >> id >> ref >> alt >> dummy >> dummy >> dummy >> dummy >> sample; + + vector altV(3); + + if (ref.size()==1 && splitString(alt,',',altV)==1) {//only SNVs allowed - ref=1-char, alt could be comma separated list of 1-char. 
splitString returns the max lenght of the split strings + altV.insert(altV.begin(),ref);//add ref to the beginning + + if (chrNameIndex.count(chr)==0) {//chr not in Genome + P.inOut->logMain << "WARNING: while processing varVCFfile file=" << P.var.vcfFile <<": chromosome '"<3 && sample.at(3)!=':') { + P.inOut->logMain << "WARNING: while processing varVCFfile file=" << P.var.vcfFile <<": more than 2 alleles per sample for line number "< nt1; + nt1[0]=convertNt01234( ref.at(0) ); + nt1[1]=convertNt01234( altV.at( atoi(&sample.at(0)) ).at(0) ); + nt1[2]=convertNt01234( altV.at( atoi(&sample.at(2)) ).at(0) ); + if (nt1[0]<4 && nt1[1]<4 && nt1[2]<4) {//only record if variant is ACGT + snp.lociV.push_back(pos-1+chrStart[chrNameIndex[chr]]); + snp.nt.push_back(nt1); + snp.N++; + }; + }; + }; + }; + getline(vcf,dummy); + }; +}; + + +void Variation::loadVCF(string fileIn) { + time_t rawTime; + time(&rawTime); + P.inOut->logMain << timeMonthDayTime(rawTime) <<" ..... loading variations VCF\n" <logStdOut << timeMonthDayTime(rawTime) <<" ..... loading variations VCF\n" <logMain << timeMonthDayTime(rawTime) <<" ..... Loaded VCF data, found "<logMain, EXIT_CODE_INPUT_FILES, P); + }; + + + uint *s1=new uint[2*snp.N]; + for (uint ii=0;ii > nt1=snp.nt; + for (uint ii=0;iilogMain << timeMonthDayTime(rawTime) <<" ..... 
Finished sorting VCF data"<>> &snpV) { + int32 isnp=binarySearch1b (blockStart, loci, N); + while ((uint)isnp snp1; + snp1[0]=(int) (loci[isnp]-blockStart)+blockShift; + snp1[1]=(int) nt[isnp][ii+1]; + snpV[ii].push_back(snp1); + }; + }; + ++isnp; + }; +}; + +vector>> Variation::sjdbSnp(uint sjStart, uint sjEnd, uint sjdbOverhang1) { + vector>> snpV(2); + + if (!yes) {//no variation, return 1 empty element + vector>> snpV1(1); + return snpV1; + }; + + snp.snpOnBlocks(sjStart-sjdbOverhang1, sjdbOverhang1, 0, snpV); + snp.snpOnBlocks(sjEnd+1, sjdbOverhang1, sjdbOverhang1, snpV); + + if (snpV.at(0).empty() && snpV.at(1).empty()) { + snpV.pop_back(); + } else if (snpV.at(0) == snpV.at(1)) { + snpV.pop_back(); + }; + + return snpV; +}; diff --git a/star-sys/STAR/source/Variation.h b/star-sys/STAR/source/Variation.h new file mode 100644 index 0000000..dd7ac8c --- /dev/null +++ b/star-sys/STAR/source/Variation.h @@ -0,0 +1,53 @@ +#ifndef CODE_Variation +#define CODE_Variation + +#include "IncludeDefine.h" +#include "Parameters.h" +#include + +// struct SNPnt +// { +// char ref; +// char a1; +// char a2; +// }; + +class SNP +{ +public: + uint32 N; //number of snps + uint* loci; //snp coordinates + vector lociV; //snp coordinates vector +// SNPnt* nt; //reference and alternative bases +// char **nt; //reference and alternative bases +// char *nt1; //1D array to store nt + vector> nt;//reference and alternative bases + + //methods + void snpOnBlocks(uint blockStart, uint blockL, int blockShift, vector>> &snpV); +}; + +class Variation +{ +public: + //methods + Variation (Parameters &Pin, vector &chrStart, map &chrNameIndex); //create transcriptome structure, load and initialize parameters + void loadVCF(string fileIn); //load VCF file + vector>> sjdbSnp(uint sjStart, uint sjEnd, uint sjdbOverhang1); //calculates snp loci in sjdb sequences + + //variables + bool yes; + SNP snp; + + Parameters &P; //TODO: make this private + +private: + string vcfFile; + //string 
varOutFileName; + //ofstream varOutStream;//output file for variations + + vector &chrStart; //this needs to be replaced with a structure that contains basic genome variables + map &chrNameIndex; +}; + +#endif diff --git a/star-sys/STAR/source/alignSmithWaterman.cpp b/star-sys/STAR/source/alignSmithWaterman.cpp new file mode 100644 index 0000000..e7b9673 --- /dev/null +++ b/star-sys/STAR/source/alignSmithWaterman.cpp @@ -0,0 +1,152 @@ +#include "IncludeDefine.h" +#include "Transcript.h" +// local alignment with Smith-Waterman algorithm +intSWscore alignSmithWaterman(char *R, uint rL, char *G, uint gL, \ + intSWscore pMatch, intSWscore pMismatch, intSWscore pGapOpen, intSWscore pGapExtend, \ + char* T, uint Tsize, Transcript &trA) { + + intSWscore *H=new intSWscore[rL+1]; + + uint rL1=rL+1; + if (rL1*(gL+1)>Tsize) return (intSWscore) 0; + + intSWscore *E=new intSWscore[rL1]; + + memset(H,0,sizeof(H[0])*(rL1)); + memset(E,0,sizeof(E[0])*(rL1)); + + + + intSWscore maxH=0; + uint maxHr=0, maxHg=0; + + for (uint ig=1;ig<=gL;ig++) {//cycle over colums + intSWscore F=(intSWscore) 0; + intSWscore HdiagPrev=0; + for (uint ir=1;ir<=rL;ir++) {//cycle over rows + + E[ir]=max( E[ir]-pGapExtend, H[ir]-pGapOpen ); + E[ir]=max( E[ir], (intSWscore) 0 ); + + F = max( F-pGapExtend, H[ir-1]-pGapOpen ); + F = max( F, (intSWscore) 0); + + intSWscore Hdiag = G[ig-1]==R[ir-1] ? 
HdiagPrev+pMatch : HdiagPrev-pMismatch; + +// if (H[ir]>E[ir] & H[ir]>F) + + HdiagPrev=H[ir]; + + if (F>Hdiag && F>E[ir]) {//insertion (gap in read) + H[ir]=F; + T[ir+ig*rL1]=1; + } else if (Hdiag>F && Hdiag>E[ir]) {//match-mismatch + H[ir]=Hdiag; + T[ir+ig*rL1]=2; + } else {//deletion (gap in genome) + H[ir]=E[ir]; + T[ir+ig*rL1]=3; + }; + + if (H[ir]<0) { + H[ir]=0; + }; + + if (H[ir]==0) { + T[ir+ig*rL1]=0; + }; + +// Hdiag=max(Hdiag,E[ir]); +// Hdiag=max(Hdiag,F); +// H[ir]=max(Hdiag,(intSWscore) 0); + + if (H[ir]>maxH) { + maxH=H[ir]; + maxHr=ir; + maxHg=ig; + }; + #ifdef DEBUG_SW + stdOut << setw(2)<0 && ir>0 && ig>0) { + if (T[ir+ig*rL1]==2) { + if (prevOper==2) {//increase length + trA.exons[trA.nExons][EX_L]++; + } else {//new exon + ++trA.nExons; + trA.exons[trA.nExons][EX_L]=1; + trA.exons[trA.nExons][EX_R]=ir; + trA.exons[trA.nExons][EX_G]=ig; + prevOper=2; + }; + --ir; + --ig; + } else if (T[ir+ig*rL1]==1) {//gap in read + --ir; + prevOper=1; + } else {//gap in genome + --ig; + prevOper=3; + }; + }; + + ++trA.nExons; + for (uint ii=0;ii=1 && ig>=1) { +// +// rMap[ir-1]=ig; +// +// }; + + return maxH; +}; diff --git a/star-sys/STAR/source/alignSmithWaterman.h b/star-sys/STAR/source/alignSmithWaterman.h new file mode 100644 index 0000000..e699541 --- /dev/null +++ b/star-sys/STAR/source/alignSmithWaterman.h @@ -0,0 +1,2 @@ +#include "IncludeDefine.h" +intSWscore alignSmithWaterman(char *R, uint rL, char *G, uint gL, intSWscore pMatch, intSWscore pMismatch, intSWscore pGapOpen, intSWscore pGapExtend, char* T, uint Tsize, Transcript &trA); diff --git a/star-sys/STAR/source/bamRemoveDuplicates.cpp b/star-sys/STAR/source/bamRemoveDuplicates.cpp new file mode 100644 index 0000000..ef8fc64 --- /dev/null +++ b/star-sys/STAR/source/bamRemoveDuplicates.cpp @@ -0,0 +1,251 @@ +#include +#include "bamRemoveDuplicates.h" +#include +#include "htslib/htslib/sam.h" +#include "IncludeDefine.h" +#include SAMTOOLS_BGZF_H +#include "ErrorWarning.h" + +#define 
compareReturn(a,b) if(a>b) {return 1;} else if (a>24; + uint32 lb=(pb[3]<<24)>>24; + + compareReturn(la,lb) else { + char* ca=(char*) (pa+9); + char* cb=(char*) (pb+9); + for (uint32 ii=0;ii>16; + uint32 fb=pb[4]>>16; + + compareReturn((fa&0x80), (fb&0x80)); + return 0; + }; +}; + +uint32 funStartExtendS(const uint32* const p) {//calculates align start extending right S operation + uint32* cig=(uint32*) (((char*) p)+9*4+((p[3]<<24)>>24)); + if ( ((cig[0]<<28)>>28) == 4 ) {//first (right) operation is S + return p[2]-(cig[0]>>4); + } else { + return p[2]; + }; +}; + +uint32 funCigarExtendS(const uint32* const p, uint32* cout) { + uint32* cig=(uint32*) (((char*) p)+9*4+((p[3]<<24)>>24)); + uint32 n=(p[4]<<16)>>16, n1=n; + + if (((cig[0]<<28)>>28) == 4) { + --n1; + memcpy((char*) cout, (char*) (cig+1), n1*sizeof(uint32));//copy CIGAR starting from the 2nd operation + cout[0]+=(cig[0]>>4)<<4; + } else { + memcpy((char*) cout, (char*) cig, n*sizeof(uint32));//copy full CIGAR + }; + if (((cig[n-1]<<28)>>28) == 4) {//remove last S opeartion add length to previous M + --n1; + cout[n1-1]+=(cig[n-1]>>4)<<4; + }; + return n1; +}; + +int funCompareCigarsExtendS(const uint32* const pa, const uint32* const pb){ + uint32 ca[100], cb[100]; + uint32 na=funCigarExtendS(pa,ca); + uint32 nb=funCigarExtendS(pb,cb); + compareReturn(na,nb); + for (uint32 ii=0; ii>16,pb1[4]>>16);//FLAG match + compareReturn(pa2[4]>>16,pb2[4]>>16);//FLAG match - 2nd mate + + int ret1=funCompareCigarsExtendS(pa1,pb1); + if (ret1!=0) return ret1; + ret1=funCompareCigarsExtendS(pa2,pb2); + if (ret1!=0) return ret1; + + //compare sequences + uint8_t* sa=((uint8_t*) pa2)+9*4+((pa2[3]<<24)>>24)+((pa2[4]<<16)>>16)*4; + uint8_t* sb=((uint8_t*) pb2)+9*4+((pb2[3]<<24)>>24)+((pb2[4]<<16)>>16)*4; + if (((pa2[4]>>16) & 0x10) == 0) {//not reverse complemented + uint ii=1; + for (; ii0) { + compareReturn((sa[ii/2]>>4),(sb[ii/2]>>4)); + }; + } else { + uint32 ii=pa2[5]-g_bamRemoveDuplicatesMate2basesN; + if (ii%2>0) { + 
compareReturn((sa[ii/2]&15),(sb[ii/2]&15)); + ++ii; + }; + for (; iibamLength) {//reached end of loaded BAM block, add BAM data + if (bamLength0) {//reached end of BAM file, cannot load more + bamFileEnd=true; + } else { + if (bamS==0 && bamLength>0) {//TODO + ostringstream errOut; + errOut <<"EXITING because of fatal ERROR: not enough memory for marking duplicates \n"; + errOut <<"SOLUTION: re-run STAR with at least --limitBAMsortRAM " <logMain, EXIT_CODE_PARAMETER, P); + }; + + //write out processed block + bgzf_write(bgzfOut,bamRaw,bamS); + + bamLength-=bamS; + memmove(bamRaw, bamRaw+bamS,bamLength); //move the non-processed part of the block to the beginning of bamRaw + bamLength+=bgzf_read(bamIn, bamRaw+bamLength, bamLengthMax-bamLength);//marks the end of the BAM block that has been read + //restart search for the group + bamS=0; + bamE=0; + bamE1=bamE+*(uint32*)(bamRaw+bamE)+4;//next alignment + rightMax=0; + grN=0; + }; + }; + + int nMult=0; + uint32 chrE=0; + uint32 leftE=0; + uint32 rightE=0; + uint32 chrS=0; + + if (!bamFileEnd) + { + uint32* bamP=(uint32*) (bamRaw+bamE);//pointer to the 1st mate of the pair + + bamA->data=((uint8_t*) bamP)+9*4+((bamP[3]<<24)>>24)+((bamP[4]<<16)>>16)*4+(bamP[5]+1)/2+bamP[5];//add length for: core, name, cigar, seq, qual + bamA->l_data=((uint8_t*) bamP)+bamP[0]+1-bamA->data; + + nMult=bam_aux2i(bam_aux_get(bamA,"NH")); + + if (nMult==1 || (nMult>1 && P.removeDuplicates.markMulti)) + { + bamP[4] |= (0x400<<16);//mark all aligns as duplicate, will unmark. 
If multimappers, onyl mark if markMult=true + }; + + chrE=bamP[1]; + leftE=bamP[2]; + rightE=bamP[7]; + + chrS=*(uint32*)(bamRaw+bamS+4*1); + }; + + if ( chrE !=chrS || (rightMax>0 && leftE>rightMax) || bamFileEnd ) {//found new group of reads to be processed, start collapsing procedure + qsort((void*) aD, grN, sizeof(uint), funCompareNames); + qsort((void*) aD, grN/2, 2*sizeof(uint), funCompareCoordFlagCigarSeq); + //go through the list and select non-duplicates + int bScore=-999, bP=0; + for (uint pp=0; ppdata=((uint8_t*) bamP1)+9*4+((bamP1[3]<<24)>>24)+((bamP1[4]<<16)>>16)*4+(bamP1[5]+1)/2+bamP1[5];//add length for: core, name, cigar, seq, qual + bamA->l_data=((uint8_t*) bamP1)+bamP1[0]+1-bamA->data; + int score1=bam_aux2i(bam_aux_get(bamA,"AS")); + if (score1>bScore) { + bScore=score1; + bP=pp; + }; + + if ( pp==(grN/2-1) || funCompareCoordFlagCigarSeq((void*) (aD+pp*2),(void*) (aD+pp*2+2))!=0 ) {//next pair is not equal to the current one + //un-mark duplicates + uint32* bamPb=(uint32*) aD[bP*2+1];//pointer to the 2nd mate of the pair + bamPb[4] ^= (0x400<<16); + bamPb=(uint32*) aD[bP*2];//pointer to the 1st mate of the pair + bamPb[4] ^= (0x400<<16); + //cout << ((char*)(bamPb+9)) <<"\n"; + bScore=-999;//reset best score + }; + }; + + + //reset for the next group + if (bamFileEnd) break; //exit the main cycle over blocks + rightMax=0; + bamS=bamE; + grN=0; + }; + + if (nMult==1) {//record this alignment in the current group, unique mappers only. 
Multi-mappers will not be considered for collapsing, and will remain marked as duplicates + if (grN>=grNmax) {//reallocate + grNmax=grN*2; + uint *aD1=new uint[grNmax]; + memcpy((char*) aD1, (char*) aD, grN*sizeof(uint)); + delete [] aD; + aD=aD1; + cerr << "reallocated array "<leftE) {//left mate, record coordinate of its right mate + rightMax=max(rightMax, rightE); + }; + }; + + bamE=bamE1;//shift to the next record + bamE1=bamE+*(uint32*)(bamRaw+bamE)+4;//next alignment + + }; + + bgzf_write(bgzfOut,bamRaw,bamLength); + bgzf_flush(bgzfOut); + bgzf_close(bgzfOut); +}; diff --git a/star-sys/STAR/source/bamRemoveDuplicates.h b/star-sys/STAR/source/bamRemoveDuplicates.h new file mode 100644 index 0000000..79fccbe --- /dev/null +++ b/star-sys/STAR/source/bamRemoveDuplicates.h @@ -0,0 +1,10 @@ +#ifndef CODE_bamRemoveDuplicates +#define CODE_bamRemoveDuplicates +#include +#include "Parameters.h" + +using namespace std; + +void bamRemoveDuplicates(const string bamFileName, const string bamFileNameOut, Parameters &P); + +#endif diff --git a/star-sys/STAR/source/bam_cat.c b/star-sys/STAR/source/bam_cat.c new file mode 100644 index 0000000..944cafb --- /dev/null +++ b/star-sys/STAR/source/bam_cat.c @@ -0,0 +1,143 @@ +/* + +bam_cat -- efficiently concatenates bam files + +bam_cat can be used to concatenate BAM files. Under special +circumstances, it can be used as an alternative to 'samtools merge' to +concatenate multiple sorted files into a single sorted file. For this +to work each file must be sorted, and the sorted files must be given +as command line arguments in order such that the final read in file i +is less than or equal to the first read in file i+1. + +This code is derived from the bam_reheader function in samtools 0.1.8 +and modified to perform concatenation by Chris Saunders on behalf of +Illumina. 
+ +2014-06-27: Alex Dobin modified the code slighlty: +* to compile with only htslib, no need for samtools package +* removed the samtools interface function (main) +* added header file "bam_cat.h" + + +########## License: + +The MIT License + +Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. +Modified SAMtools work copyright (c) 2010 Illumina, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +*/ + +#include "bam_cat.h" + +#include +#include +#include + +#include "htslib/htslib/bgzf.h" +#include "htslib/htslib/sam.h" +#include + +#define BUF_SIZE 0x10000 + +#define GZIPID1 31 +#define GZIPID2 139 + +#define BGZF_EMPTY_BLOCK_SIZE 28 + + +int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) +{ + BGZF *fp; + uint8_t *buf; + uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; + const int es=BGZF_EMPTY_BLOCK_SIZE; + int i; + + fp = strcmp(outbam, "-")? 
bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w"); + if (fp == 0) { + fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam); + return 1; + } + if (h) bam_hdr_write(fp, h); + + buf = (uint8_t*) malloc(BUF_SIZE); + for(i = 0; i < nfn; ++i){ + BGZF *in; + bam_hdr_t *old; + int len,j; + + in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r"); + if (in == 0) { + fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); + return -1; + } + if (in->is_write) return -1; + + old = bam_hdr_read(in); + if (h == 0 && i == 0) bam_hdr_write(fp, old); + + if (in->block_offset < in->block_length) { + bgzf_write(fp, (void*)((char*)in->uncompressed_block + in->block_offset), in->block_length - in->block_offset); + bgzf_flush(fp); + } + + j=0; + while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) { + if(lenX[N-1] || xi1+1) {//binary search + i3=(i1+i2)/2; + if (X[i3]>x) { + i2=i3; + } else { + i1=i3; + }; + }; + + if (x==X[i1]) { + i3=i1; + } else if (x==X[i2]) { + i3=i2; + } else { + return -1; + }; + + for (int jj=i3;jj>=0;jj--) {//go back + if (x!=X[jj]) { + break;//next try forward + } else if (y==Y[jj]) { + return jj; + }; + }; + + for (int jj=i3;jj=re2) {//t1 block is on the right to t2, no hope of overlap + i2++; + } else if (rs2>=re1) {//t2 block is on the right to t1, no hope of overlap + i1++; + } else if (gs1-rs1 != gs2-rs2) {//no overlap + if (re1>=re2) i2++;//1 is on the right of 2 + if (re2>=re1) i1++;//2 is on the right of 1 + } else {//overlap + nOverlap += min(re1,re2) - max(rs1,rs2); + if (re1>=re2) i2++;//1 is on the right of 2 + if (re2>=re1) i1++;//2 is on the right of 1 + }; + }; + + //debug +// uint nO1=0; +// for (uint ir=0;ir0) nO1++; +// }; +// +// if (nOverlap!=nO1) { +// exit(255); +// }; +// + + return nOverlap; +}; + diff --git a/star-sys/STAR/source/blocksOverlap.h b/star-sys/STAR/source/blocksOverlap.h new file mode 100644 index 0000000..4b9d4b3 --- /dev/null +++ 
b/star-sys/STAR/source/blocksOverlap.h @@ -0,0 +1,10 @@ +#ifndef BLOCKS_OVERLAP_DEF +#define BLOCKS_OVERLAP_DEF + +#include "IncludeDefine.h" +#include "Transcript.h" + +uint blocksOverlap(Transcript &t1, Transcript &t2); + + +#endif diff --git a/star-sys/STAR/source/extendAlign.cpp b/star-sys/STAR/source/extendAlign.cpp new file mode 100644 index 0000000..c3a6144 --- /dev/null +++ b/star-sys/STAR/source/extendAlign.cpp @@ -0,0 +1,93 @@ +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "extendAlign.h" + +bool extendAlign( char* R, char* G, uint rStart, uint gStart, int dR, int dG, uint L, uint Lprev, uint nMMprev, uint nMMmax, double pMMmax, bool extendToEnd, Transcript* trA ) { + +// find the maximum score + +int iS,iG; + +int Score=0, nMatch=0, nMM=0; +trA->maxScore=0; + +R=R+rStart; +G=G+gStart; + +if (extendToEnd) {//end to end extension + + int iExt; + for (iExt=0;iExt<(int) L;iExt++) { + iS=dR*iExt; + iG=dG*iExt; + + if ((gStart+iG)==(uint)(-1) || G[iG]==5) {//prohibit extension through chr boundary + trA->extendL=0; + trA->maxScore=-999999999; + trA->nMatch=0; + trA->nMM=nMMmax+1; + return true; +// return false; + }; + if (R[iS]==MARK_FRAG_SPACER_BASE) break; //no extension through the spacer between fragments + + if (R[iS]>3 || G[iG]>3) continue;//no penalties for Ns in reads or genome + + if (G[iG]==R[iS]) {//Match + nMatch++; + Score += scoreMatch; + } else { + nMM++; + Score -= scoreMatch; + }; + }; + + if (iExt>0) { + trA->extendL=iExt; + trA->maxScore=Score; + trA->nMatch=nMatch; + trA->nMM=nMM; + return true; + } else { + return false; + }; + +}; + + +for (int i=0;i<(int) L;i++) { + iS=dR*i; + iG=dG*i; + + if ((gStart+iG)==(uint)(-1) || G[iG]==5 || R[iS]==MARK_FRAG_SPACER_BASE) break; //no extension through chr boundary, or through the spacer between fragments + if (R[iS]>3 || G[iG]>3) continue;//no penalties for Ns in reads or genome + + if (G[iG]==R[iS]) {//Match + nMatch++; + Score += scoreMatch; + if 
(Score>trA->maxScore) {//record new maximum + if (nMM+nMMprev <= min(pMMmax*double(Lprev+i+1), double(nMMmax)) ) {//check nMM, if too many mismatches - do not record this maximum. Do not break - there might be still hope to make a long extension + trA->extendL=i+1; + trA->maxScore=Score; + trA->nMatch=nMatch; + trA->nMM=nMM; + }; + }; + } else {//MM + if (nMM+nMMprev >= min(pMMmax*double(Lprev+L), double(nMMmax)) ) {//there is no hope to extend it further, break + break; + }; + + nMM++; + Score -= scoreMatch; + }; +}; + +// decide of the extension worked +bool extDone = trA->extendL > 0; + +return extDone; + +}; + diff --git a/star-sys/STAR/source/extendAlign.h b/star-sys/STAR/source/extendAlign.h new file mode 100644 index 0000000..1d07a19 --- /dev/null +++ b/star-sys/STAR/source/extendAlign.h @@ -0,0 +1,6 @@ +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" + +bool extendAlign( char* R, char* G, uint rStart, uint gStart, int dR, int dG, uint L, uint Lprev, uint nMMprev, uint nMMmax, double pMMmax, bool extendToEnd, Transcript* trA ); + diff --git a/star-sys/STAR/source/funCompareUintAndSuffixes.cpp b/star-sys/STAR/source/funCompareUintAndSuffixes.cpp new file mode 100644 index 0000000..e09ed31 --- /dev/null +++ b/star-sys/STAR/source/funCompareUintAndSuffixes.cpp @@ -0,0 +1,40 @@ +#include "funCompareUintAndSuffixes.h" + +char* g_funCompareUintAndSuffixes_G; +uint64_t g_funCompareUintAndSuffixes_L; + +int funCompareUintAndSuffixes ( const void *a, const void *b){ + uint64_t* va= ((uint64_t*) a); + uint64_t* vb= ((uint64_t*) b); + + if (va[0]>vb[0]) { + return 1; + } else if (va[0]gb[ig]) + {// second condition: reached the end of ga, it's >= than any character, but = does not matter + return 1; + } else if (ga[ig]vb[1]) + { + return 1; + } else + {//va cannot be equal to vb + return -1; + }; + } else + {//continue + ig++; + }; + }; + }; +}; diff --git a/star-sys/STAR/source/funCompareUintAndSuffixes.h 
b/star-sys/STAR/source/funCompareUintAndSuffixes.h new file mode 100644 index 0000000..e5419c7 --- /dev/null +++ b/star-sys/STAR/source/funCompareUintAndSuffixes.h @@ -0,0 +1,11 @@ +#ifndef CODE_funCompareUintAndSuffixes +#define CODE_funCompareUintAndSuffixes + +#include + +extern char* g_funCompareUintAndSuffixes_G; +extern uint64_t g_funCompareUintAndSuffixes_L; + +int funCompareUintAndSuffixes ( const void *a, const void *b); + +#endif diff --git a/star-sys/STAR/source/funCompareUintAndSuffixesMemcmp.cpp b/star-sys/STAR/source/funCompareUintAndSuffixesMemcmp.cpp new file mode 100644 index 0000000..4673417 --- /dev/null +++ b/star-sys/STAR/source/funCompareUintAndSuffixesMemcmp.cpp @@ -0,0 +1,33 @@ +#include "funCompareUintAndSuffixesMemcmp.h" +#include //for memcmp + +char* g_funCompareUintAndSuffixesMemcmp_G; +uint64_t g_funCompareUintAndSuffixesMemcmp_L; + +int funCompareUintAndSuffixesMemcmp ( const void *a, const void *b) +{ + uint64_t* va= ((uint64_t*) a); + uint64_t* vb= ((uint64_t*) b); + + if (va[0]>vb[0]) + { + return 1; + } else if (va[0]vb[1] ? 1 : -1; + }; +// int comp=va[1]>vb[1] ? 
1 : -1; + return comp; + }; +}; diff --git a/star-sys/STAR/source/funCompareUintAndSuffixesMemcmp.h b/star-sys/STAR/source/funCompareUintAndSuffixesMemcmp.h new file mode 100644 index 0000000..83c6f92 --- /dev/null +++ b/star-sys/STAR/source/funCompareUintAndSuffixesMemcmp.h @@ -0,0 +1,10 @@ +#ifndef CODE_funCompareUintAndSuffixesMemcmp +#define CODE_funCompareUintAndSuffixesMemcmp + +#include + +extern char* g_funCompareUintAndSuffixesMemcmp_G; +extern uint64_t g_funCompareUintAndSuffixesMemcmp_L; +int funCompareUintAndSuffixesMemcmp ( const void *a, const void *b); + +#endif \ No newline at end of file diff --git a/star-sys/STAR/source/genomeGenerate.h b/star-sys/STAR/source/genomeGenerate.h new file mode 100644 index 0000000..46a867f --- /dev/null +++ b/star-sys/STAR/source/genomeGenerate.h @@ -0,0 +1,4 @@ +#include "Parameters.h" + +void genomeGenerate(Parameters &P); + diff --git a/star-sys/STAR/source/genomeParametersWrite.cpp b/star-sys/STAR/source/genomeParametersWrite.cpp new file mode 100644 index 0000000..202397c --- /dev/null +++ b/star-sys/STAR/source/genomeParametersWrite.cpp @@ -0,0 +1,38 @@ +#include "genomeParametersWrite.h" +#include "streamFuns.h" + +void genomeParametersWrite(string fileName, Parameters& P, string errorOut, Genome &mapGen) +{//write the genome information into the genomePar stream + ofstream & genomePar = ofstrOpen(fileName, errorOut, P); + + genomePar << "### "<logMain << timeMonthDayTime(rawTime) <<" ... generating Suffix Array index\n" <logStdOut << timeMonthDayTime(rawTime) <<" ... generating Suffix Array index\n" <logMain << isa*100/mapGen.nSA << "% " << flush; + + uint SAstr=SA1[isa]; + bool dirG = (SAstr>>mapGen.GstrandBit) == 0; //forward or reverse strand of the genome + SAstr &= mapGen.GstrandMask; + if (!dirG) SAstr=mapGen.nGenome-1-SAstr; + + uint indPref=0; + for (uint iL=0; iL < mapGen.pGe.gSAindexNbases; iL++) {//calculate index + + indPref <<= 2; + + uint g1= (uint) G[dirG ? 
SAstr+iL : SAstr-iL]; //reverese if (-) strand + + if (g1>3) {//if N, this suffix does not belong in SAi + for (uint iL1=iL; iL1 < mapGen.pGe.gSAindexNbases; iL1++) { + SAi1[mapGen.genomeSAindexStart[iL1]+ind0[iL1]] |= mapGen.SAiMarkNmaskC; + }; + break; + }; + + if (!dirG) g1=3-g1; //complement if (-) strand + + indPref += (uint) g1; + + if ( indPref > ind0[iL] || isa==0 ) {//new && good index, record it + SAi1[mapGen.genomeSAindexStart[iL]+indPref]=isa; + for (uint ii=ind0[iL]+1; iilogMain, EXIT_CODE_INPUT_FILES, P); + }; + }; + };//for (uint isa=0; isalogMain << timeMonthDayTime(rawTime) <<" ... completed Suffix Array index\n" <logStdOut << timeMonthDayTime(rawTime) <<" ... completed Suffix Array index\n" <>mapGen.GstrandBit) == 0; //forward or reverse strand of the genome + SAstr &= mapGen.GstrandMask; + if (!dirG) SAstr=mapGen.nGenome-1-SAstr; + uint indPref1=0; + */ + + for (uint iL=0; iL < mapGen.pGe.gSAindexNbases; iL++) {//calculate index + /*{//testing: old way + indPref1 <<= 2; + + uint g1= (uint) G[dirG ? 
SAstr+iL : SAstr-iL]; //reverese if (-) strand + + if (g1>3) {//if N, this suffix does not belong in SAi + for (uint iL1=iL; iL1 < mapGen.pGe.gSAindexNbases; iL1++) { + SAi1.writePacked(mapGen.genomeSAindexStart[iL1]+ind0[iL1],SAi[mapGen.genomeSAindexStart[iL1]+ind0[iL1]] | mapGen.SAiMarkNmaskC); + }; + } else //relying on the true code to break iL cycle + { + if (!dirG) g1=3-g1; //complement if (-) strand + + indPref1 += (uint) g1; + + if ( indPref1 > ind0a[iL] || isa==0 ) {//new && good index, record it + SAi1.writePacked(mapGen.genomeSAindexStart[iL]+indPref1, isa); + for (uint ii=ind0a[iL]+1; iilogMain, EXIT_CODE_INPUT_FILES, P); + }; + }; + }; + */ + + uint indPref = indFull >> (2*(mapGen.pGe.gSAindexNbases-1-iL)); +// if (indPref!=indPref1) +// cout<< iL <<" "<< isa <<" "<< indPref <<" "< ind0[iL] || isa==0 ) {//new && good index, record it + //testing +// if (funCalcSAiFromSA(G,SA,isa,iL+1,P)!=indPref) +// cout<< iL <<" "<< isa <<" "<< indPref <<" "<logMain, EXIT_CODE_INPUT_FILES, P); + }; + + + }; + + //find next index not equal to the current one + funSAiFindNextIndex(G, SA, isaStep, isa, indFull, iL4, mapGen);//indFull and iL4 have been already defined at the previous step +// isa++; +// indFull=funCalcSAiFromSA(G,SA,isa,mapGen.pGe.gSAindexNbases,P,iL4); + };//isa cycle + delete [] ind0; + + }; + +void funSAiFindNextIndex(char * G, PackedArray & SA, uint isaStep, uint & isa, uint & indFull, int & iL4, Genome &mapGen) + { + uint indFullPrev=indFull; + int iL4prev=iL4; + isa+=isaStep; + while (isa=mapGen.nSA) + {//reached the end of the SA + indFull=funCalcSAiFromSA(G,SA,mapGen,mapGen.nSA-1,mapGen.pGe.gSAindexNbases,iL4); + if (indFull==indFullPrev && iL4==iL4prev) + { + isa=mapGen.nSA;//no more indices, the last one is equal to the previous + return; + }; + }; + + {//binary search + uint i1=isa-isaStep; + uint i2=min(isa,mapGen.nSA-1); + while (i1+10) + {//previous chr records exist + mapGen.chrStart.pop_back();//remove last record, it will be recorded 
again + N = mapGen.chrStart.back()+mapGen.chrLength.back(); + mapGen.chrLength.pop_back();//remove last record, it will be recorded again + }; + + ifstream fileIn; + for (uint ii=0;iilogMain, EXIT_CODE_INPUT_FILES, P); + }; + char cc=fileIn.peek(); + if ( !fileIn.good() ) + {// + ostringstream errOut; + errOut << "EXITING because of INPUT ERROR: could not read from genomeFastaFile: " <logMain, EXIT_CODE_INPUT_FILES, P); + }; + if (cc!='>') + { + ostringstream errOut; + errOut << "EXITING because of INPUT ERROR: the file format of the genomeFastaFile: " <'.\n"; + errOut << " Solution: check formatting of the fasta file. Make sure the file is uncompressed (unzipped).\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; while(!fileIn.eof()) {//read each file until eof + string lineIn (4096,'.'); + getline(fileIn,lineIn); + if (lineIn[0]=='>') {//new chromosome + if (!flagRun) { + istringstream lineInStream (lineIn); + lineInStream.ignore(1,' '); + string chrName1; + lineInStream >> chrName1; + mapGen.chrName.push_back(chrName1); + }; + + if (!flagRun && mapGen.chrStart.size()>0) mapGen.chrLength.push_back(N-mapGen.chrStart.at(mapGen.chrStart.size()-1)); //true length of the chr + + if (N>0) {//pad the chromosomes to bins boudnaries + N = ( (N+1)/mapGen.genomeChrBinNbases+1 )*mapGen.genomeChrBinNbases; + }; + + if (!flagRun) { + mapGen.chrStart.push_back(N); + P.inOut->logMain << mapGen.pGe.gFastaFiles.at(ii)<<" : chr # " << mapGen.chrStart.size()-1 << " \""<=32) + ++N; + }; + }; + + }; + }; + fileIn.close(); + }; + + + if (!flagRun) + mapGen.chrLength.push_back(N-mapGen.chrStart.at(mapGen.chrStart.size()-1)); //true length of the last chr + + N = ( (N+1)/mapGen.genomeChrBinNbases+1)*mapGen.genomeChrBinNbases; + + if (!flagRun) { + mapGen.nChrReal=mapGen.chrStart.size(); + mapGen.chrStart.push_back(N); //last chromosome end+1 + for (uint ii=0;ii + +CC ?= gcc +AR = ar +RANLIB = ranlib + +# TODO: edit cram code to remove need for 
-DSAMTOOLS +CPPFLAGS += -I. -DSAMTOOLS=1 +# TODO: probably update cram code to make it compile cleanly with -Wc++-compat +CFLAGS := -g -Wall -O2 $(CFLAGS) +EXTRA_CFLAGS_PIC = -fpic +LDFLAGS = +LDLIBS = + +prefix = /usr/local +exec_prefix = $(prefix) +bindir = $(exec_prefix)/bin +includedir = $(prefix)/include +libdir = $(exec_prefix)/lib +mandir = $(prefix)/share/man +man1dir = $(mandir)/man1 +man5dir = $(mandir)/man5 + +INSTALL = install -p +INSTALL_PROGRAM = $(INSTALL) +INSTALL_DATA = $(INSTALL) -m 644 + +BUILT_PROGRAMS = \ + bgzip \ + tabix + +BUILT_TEST_PROGRAMS = \ + test/fieldarith \ + test/hfile \ + test/sam \ + test/test_view \ + test/test-vcf-api \ + test/test-vcf-sweep + +all: lib-static lib-shared $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) + +HTSPREFIX = +include htslib_vars.mk + +lib-static: libhts.a + +# $(shell), :=, and ifeq/.../endif are GNU Make-specific. If you don't have +# GNU Make, comment out the parts of this conditional that don't apply. +PLATFORM := $(shell uname -s) +ifeq "$(PLATFORM)" "Darwin" +SHLIB_FLAVOUR = dylib +lib-shared: libhts.dylib +else +SHLIB_FLAVOUR = so +lib-shared: libhts.so +endif + + +PACKAGE_VERSION = 0.0.1 +LIBHTS_SOVERSION = 0 + + +# $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string +# even if this is a dirty or untagged Git working tree. +NUMERIC_VERSION = $(PACKAGE_VERSION) + +# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git +# description of the working tree: either a release tag with the same value +# as $(PACKAGE_VERSION) above, or an exact description likely based on a tag. +# Much of this is also GNU Make-specific. If you don't have GNU Make and/or +# are not building from a Git repository, comment out this conditional. 
+ifneq "$(wildcard .git)" "" +original_version := $(PACKAGE_VERSION) +PACKAGE_VERSION := $(shell git describe --always --dirty) + +# Unless the Git description matches /\d*\.\d*(\.\d*)?/, i.e., is exactly a tag +# with a numeric name, revert $(NUMERIC_VERSION) to the original version number +# written above, but with the patchlevel field bumped to 255. +ifneq "$(subst ..,.,$(subst 0,,$(subst 1,,$(subst 2,,$(subst 3,,$(subst 4,,$(subst 5,,$(subst 6,,$(subst 7,,$(subst 8,,$(subst 9,,$(PACKAGE_VERSION))))))))))))" "." +empty := +NUMERIC_VERSION := $(subst $(empty) ,.,$(wordlist 1,2,$(subst ., ,$(original_version))) 255) +endif + +# Force version.h to be remade if $(PACKAGE_VERSION) has changed. +version.h: $(if $(wildcard version.h),$(if $(findstring "$(PACKAGE_VERSION)",$(shell cat version.h)),,force)) +endif + +version.h: + echo '#define HTS_VERSION "$(PACKAGE_VERSION)"' > $@ + + +.SUFFIXES: .c .o .pico + +.c.o: + $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $< + +.c.pico: + $(CC) $(CFLAGS) $(CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< + + +LIBHTS_OBJS = \ + kfunc.o \ + knetfile.o \ + kstring.o \ + bgzf.o \ + faidx.o \ + hfile.o \ + hfile_net.o \ + hts.o \ + sam.o \ + synced_bcf_reader.o \ + vcf_sweep.o \ + tbx.o \ + vcf.o \ + vcfutils.o \ + cram/cram_codecs.o \ + cram/cram_decode.o \ + cram/cram_encode.o \ + cram/cram_index.o \ + cram/cram_io.o \ + cram/cram_samtools.o \ + cram/cram_stats.o \ + cram/files.o \ + cram/mFILE.o \ + cram/md5.o \ + cram/open_trace_file.o \ + cram/pooled_alloc.o \ + cram/sam_header.o \ + cram/string_alloc.o \ + cram/thread_pool.o \ + cram/vlen.o \ + cram/zfio.o + + +libhts.a: $(LIBHTS_OBJS) + @-rm -f $@ + $(AR) -rc $@ $(LIBHTS_OBJS) + -$(RANLIB) $@ + + +# The target here is libhts.so, as that is the built file that other rules +# depend upon and that is used when -lhts appears in other program's recipes. 
+# As a byproduct invisible to make, libhts.so.NN is also created, as it is the +# file used at runtime (when $LD_LIBRARY_PATH includes the build directory). + +libhts.so: $(LIBHTS_OBJS:.o=.pico) + $(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LDLIBS) -lz + ln -sf $@ libhts.so.$(LIBHTS_SOVERSION) + +# Similarly this also creates libhts.NN.dylib as a byproduct, so that programs +# when run can find this uninstalled shared library (when $DYLD_LIBRARY_PATH +# includes this project's build directory). + +libhts.dylib: $(LIBHTS_OBJS) + $(CC) -dynamiclib -install_name $(libdir)/libhts.$(LIBHTS_SOVERSION).dylib -current_version $(NUMERIC_VERSION) -compatibility_version $(LIBHTS_SOVERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS) $(LDLIBS) -lz + ln -sf $@ libhts.$(LIBHTS_SOVERSION).dylib + + +cram_h = cram/cram.h $(cram_samtools_h) $(cram_sam_header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h +cram_io_h = cram/cram_io.h $(cram_misc_h) +cram_misc_h = cram/misc.h cram/os.h +cram_sam_header_h = cram/sam_header.h cram/string_alloc.h cram/pooled_alloc.h htslib/khash.h htslib/kstring.h +cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h) +cram_structs_h = cram/cram_structs.h cram/thread_pool.h cram/string_alloc.h htslib/khash.h +cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h +hfile_internal_h = hfile_internal.h $(htslib_hfile_h) + +bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) htslib/khash.h +kstring.o kstring.pico: kstring.c htslib/kstring.h +knetfile.o knetfile.pico: knetfile.c htslib/knetfile.h +hfile.o hfile.pico: hfile.c $(htslib_hfile_h) $(hfile_internal_h) +hfile_net.o hfile_net.pico: hfile_net.c $(hfile_internal_h) htslib/knetfile.h +hts.o hts.pico: hts.c version.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/ksort.h 
+vcf.o vcf.pico: vcf.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/kstring.h +sam.o sam.pico: sam.c $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/kstring.h +tbx.o tbx.pico: tbx.c $(htslib_tbx_h) $(htslib_bgzf_h) htslib/khash.h +faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) htslib/khash.h htslib/knetfile.h +synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c $(htslib_synced_bcf_reader_h) htslib/kseq.h htslib/khash_str2int.h +vcf_sweep.o vcf_sweep.pico: vcf_sweep.c $(htslib_vcf_sweep_h) $(htslib_bgzf_h) +vcfutils.o vcfutils.pico: vcfutils.c $(htslib_vcfutils_h) +kfunc.o kfunc.pico: kfunc.c htslib/kfunc.h + +cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c $(cram_h) +cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c $(cram_h) cram/os.h cram/md5.h +cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c $(cram_h) cram/os.h cram/md5.h +cram/cram_index.o cram/cram_index.pico: cram/cram_index.c $(htslib_hfile_h) $(cram_h) cram/os.h cram/zfio.h +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c $(cram_h) cram/os.h cram/md5.h $(cram_open_trace_file_h) $(htslib_hfile_h) +cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c $(cram_h) $(htslib_sam_h) +cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c $(cram_h) cram/os.h +cram/files.o cram/files.pico: cram/files.c $(cram_misc_h) +cram/mFILE.o cram/mFILE.pico: cram/mFILE.c cram/os.h cram/mFILE.h cram/vlen.h +cram/md5.o cram/md5.pico: cram/md5.c cram/md5.h +cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c $(cram_open_trace_file_h) $(cram_misc_h) +cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c cram/pooled_alloc.h +cram/sam_header.o cram/sam_header.pico: cram/sam_header.c $(cram_sam_header_h) cram/string_alloc.h +cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c cram/string_alloc.h 
+cram/thread_pool.o cram/thread_pool.pico: cram/thread_pool.c cram/thread_pool.h +cram/vlen.o cram/vlen.pico: cram/vlen.c cram/vlen.h cram/os.h +cram/zfio.o cram/zfio.pico: cram/zfio.c cram/os.h cram/zfio.h + + +bgzip: bgzip.o libhts.a + $(CC) -pthread $(LDFLAGS) -o $@ bgzip.o libhts.a $(LDLIBS) -lz + +tabix: tabix.o libhts.a + $(CC) -pthread $(LDFLAGS) -o $@ tabix.o libhts.a $(LDLIBS) -lz + +bgzip.o: bgzip.c $(htslib_bgzf_h) $(htslib_hts_h) +tabix.o: tabix.c $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) htslib/kseq.h $(htslib_bgzf_h) $(htslib_hts_h) + + +check test: $(BUILT_TEST_PROGRAMS) + test/fieldarith test/fieldarith.sam + test/hfile + test/sam + cd test && ./test_view.pl + cd test && ./test.pl + +test/fieldarith: test/fieldarith.o libhts.a + $(CC) -pthread $(LDFLAGS) -o $@ test/fieldarith.o libhts.a $(LDLIBS) -lz + +test/hfile: test/hfile.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/hfile.o libhts.a $(LDLIBS) -lz + +test/sam: test/sam.o libhts.a + $(CC) -pthread $(LDFLAGS) -o $@ test/sam.o libhts.a $(LDLIBS) -lz + +test/test_view: test/test_view.o libhts.a + $(CC) -pthread $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LDLIBS) -lz + +test/test-vcf-api: test/test-vcf-api.o libhts.a + $(CC) -pthread $(LDFLAGS) -o $@ test/test-vcf-api.o libhts.a $(LDLIBS) -lz + +test/test-vcf-sweep: test/test-vcf-sweep.o libhts.a + $(CC) -pthread $(LDFLAGS) -o $@ test/test-vcf-sweep.o libhts.a $(LDLIBS) -lz + +test/fieldarith.o: test/fieldarith.c $(htslib_sam_h) +test/hfile.o: test/hfile.c $(htslib_hfile_h) $(htslib_hts_defs_h) +test/sam.o: test/sam.c $(htslib_sam_h) htslib/kstring.h +test/test_view.o: test/test_view.c $(cram_h) $(htslib_sam_h) +test/test-vcf-api.o: test/test-vcf-api.c $(htslib_hts_h) $(htslib_vcf_h) htslib/kstring.h +test/test-vcf-sweep.o: test/test-vcf-sweep.c $(htslib_vcf_sweep_h) + + +install: installdirs install-$(SHLIB_FLAVOUR) + $(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir) + $(INSTALL_DATA) htslib/*.h $(DESTDIR)$(includedir)/htslib + 
$(INSTALL_DATA) libhts.a $(DESTDIR)$(libdir)/libhts.a + $(INSTALL_DATA) *.1 $(DESTDIR)$(man1dir) + $(INSTALL_DATA) *.5 $(DESTDIR)$(man5dir) + +installdirs: + mkdir -p $(DESTDIR)$(bindir) $(DESTDIR)$(includedir)/htslib $(DESTDIR)$(libdir) $(DESTDIR)$(man1dir) $(DESTDIR)$(man5dir) + +# After installation, the real file in $(libdir) will be libhts.so.X.Y.Z, +# with symlinks libhts.so (used via -lhts during linking of client programs) +# and libhts.so.NN (used by client executables at runtime). + +install-so: libhts.so installdirs + $(INSTALL_DATA) libhts.so $(DESTDIR)$(libdir)/libhts.so.$(PACKAGE_VERSION) + ln -sf libhts.so.$(PACKAGE_VERSION) $(DESTDIR)$(libdir)/libhts.so + ln -sf libhts.so.$(PACKAGE_VERSION) $(DESTDIR)$(libdir)/libhts.so.$(LIBHTS_SOVERSION) + +install-dylib: libhts.dylib installdirs + $(INSTALL_PROGRAM) libhts.dylib $(DESTDIR)$(libdir)/libhts.$(PACKAGE_VERSION).dylib + ln -sf libhts.$(PACKAGE_VERSION).dylib $(DESTDIR)$(libdir)/libhts.dylib + ln -sf libhts.$(PACKAGE_VERSION).dylib $(DESTDIR)$(libdir)/libhts.$(LIBHTS_SOVERSION).dylib + + +testclean: + -rm -f test/*.tmp test/*.tmp.* + +mostlyclean: testclean + -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h + +clean: mostlyclean clean-$(SHLIB_FLAVOUR) + -rm -f libhts.a $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) + +distclean: clean + -rm -f TAGS + +clean-so: + -rm -f libhts.so libhts.so.* + +clean-dylib: + -rm -f libhts.dylib libhts.*.dylib + + +tags: + ctags -f TAGS *.[ch] cram/*.[ch] htslib/*.h + + +force: + + +.PHONY: all check clean distclean force install installdirs +.PHONY: lib-shared lib-static mostlyclean tags test testclean +.PHONY: clean-so install-so +.PHONY: clean-dylib install-dylib diff --git a/star-sys/STAR/source/htslib/README.md b/star-sys/STAR/source/htslib/README.md new file mode 100644 index 0000000..4afb112 --- /dev/null +++ b/star-sys/STAR/source/htslib/README.md @@ -0,0 +1,17 @@ +HTSlib is an implementation of a unified C library for accessing common file 
+formats, such as [SAM, CRAM and VCF][1], used for high-throughput sequencing +data, and is the core library used by [samtools][2] and [bcftools][3]. +HTSlib only depends on [zlib][4]. +It is known to be compatible with gcc, g++ and clang. + +HTSlib implements a generalized BAM index, with file extension `.csi` +(coordinate-sorted index). The HTSlib file reader first looks for the new index +and then for the old if the new index is absent. + +This project also includes the popular tabix indexer, which indexes both `.tbi` +and `.csi` formats, and the bgzip compression utility. + +[1]: http://samtools.github.io/hts-specs/ +[2]: http://github.com/samtools/samtools +[3]: http://samtools.github.io/bcftools/ +[4]: http://zlib.net/ diff --git a/star-sys/STAR/source/htslib/bgzf.c b/star-sys/STAR/source/htslib/bgzf.c new file mode 100644 index 0000000..1bf799b --- /dev/null +++ b/star-sys/STAR/source/htslib/bgzf.c @@ -0,0 +1,1085 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + 2011 Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "htslib/hts.h" +#include "htslib/bgzf.h" +#include "htslib/hfile.h" + +#define BLOCK_HEADER_LENGTH 18 +#define BLOCK_FOOTER_LENGTH 8 + + +/* BGZF/GZIP header (specialized from RFC 1952; little endian): + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN| + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + BGZF extension: + ^ ^ ^ ^ + | | | | + FLG.EXTRA XLEN B C + + BGZF format is compatible with GZIP. It limits the size of each compressed + block to 2^16 bytes and adds an extra "BC" field in the gzip header which + records the size. + +*/ +static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; + +#ifdef BGZF_CACHE +typedef struct { + int size; + uint8_t *block; + int64_t end_offset; +} cache_t; +#include "htslib/khash.h" +KHASH_MAP_INIT_INT64(cache, cache_t) +#endif + +typedef struct +{ + uint64_t uaddr; // offset w.r.t. uncompressed data + uint64_t caddr; // offset w.r.t. 
compressed data +} +bgzidx1_t; + +struct __bgzidx_t +{ + int noffs, moffs; // the size of the index, n:used, m:allocated + bgzidx1_t *offs; // offsets + uint64_t ublock_addr; // offset of the current block (uncompressed data) +}; + +void bgzf_index_destroy(BGZF *fp); +int bgzf_index_add_block(BGZF *fp); + +static inline void packInt16(uint8_t *buffer, uint16_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; +} + +static inline int unpackInt16(const uint8_t *buffer) +{ + return buffer[0] | buffer[1] << 8; +} + +static inline void packInt32(uint8_t *buffer, uint32_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; +} + +static BGZF *bgzf_read_init(hFILE *hfpr) +{ + BGZF *fp; + uint8_t magic[2]; + ssize_t n = hpeek(hfpr, magic, 2); + if (n < 0) return NULL; + + fp = (BGZF*)calloc(1, sizeof(BGZF)); + if (fp == NULL) return NULL; + + fp->is_write = 0; + fp->is_compressed = (n==2 && magic[0]==0x1f && magic[1]==0x8b); + fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); +#ifdef BGZF_CACHE + fp->cache = kh_init(cache); +#endif + return fp; +} + +static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level, -2 plain uncompressed +{ + BGZF *fp; + fp = (BGZF*)calloc(1, sizeof(BGZF)); + fp->is_write = 1; + if ( compress_level==-2 ) + { + fp->is_compressed = 0; + return fp; + } + fp->is_compressed = 1; + fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compress_level = compress_level < 0? 
Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 + if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; + return fp; +} +// get the compress level from the mode string +static int mode2level(const char *__restrict mode) +{ + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = -2; + return compress_level; +} + +BGZF *bgzf_open(const char *path, const char *mode) +{ + BGZF *fp = 0; + assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); + if (strchr(mode, 'r')) { + hFILE *fpr; + if ((fpr = hopen(path, mode)) == 0) return 0; + fp = bgzf_read_init(fpr); + if (fp == 0) { hclose_abruptly(fpr); return NULL; } + fp->fp = fpr; + } else if (strchr(mode, 'w') || strchr(mode, 'a')) { + hFILE *fpw; + if ((fpw = hopen(path, mode)) == 0) return 0; + fp = bgzf_write_init(mode2level(mode)); + fp->fp = fpw; + } + else { errno = EINVAL; return 0; } + + fp->is_be = ed_is_big(); + return fp; +} + +BGZF *bgzf_dopen(int fd, const char *mode) +{ + BGZF *fp = 0; + assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); + if (strchr(mode, 'r')) { + hFILE *fpr; + if ((fpr = hdopen(fd, mode)) == 0) return 0; + fp = bgzf_read_init(fpr); + if (fp == 0) { hclose_abruptly(fpr); return NULL; } // FIXME this closes fd + fp->fp = fpr; + } else if (strchr(mode, 'w') || strchr(mode, 'a')) { + hFILE *fpw; + if ((fpw = hdopen(fd, mode)) == 0) return 0; + fp = bgzf_write_init(mode2level(mode)); + fp->fp = fpw; + } + else { errno = EINVAL; return 0; } + + fp->is_be = ed_is_big(); + return fp; +} + +BGZF *bgzf_hopen(hFILE *hfp, const char *mode) +{ + BGZF *fp = NULL; + assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); + if (strchr(mode, 'r')) { + fp = bgzf_read_init(hfp); + if (fp == NULL) return NULL; + } else if (strchr(mode, 'w') || strchr(mode, 'a')) { + fp = bgzf_write_init(mode2level(mode)); + 
} + else { errno = EINVAL; return 0; } + + fp->fp = hfp; + fp->is_be = ed_is_big(); + return fp; +} + +static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level) +{ + uint32_t crc; + z_stream zs; + uint8_t *dst = (uint8_t*)_dst; + + // compress the body + zs.zalloc = NULL; zs.zfree = NULL; + zs.next_in = (Bytef*)src; + zs.avail_in = slen; + zs.next_out = dst + BLOCK_HEADER_LENGTH; + zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer + if (deflate(&zs, Z_FINISH) != Z_STREAM_END) return -1; + if (deflateEnd(&zs) != Z_OK) return -1; + *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + // write the header + memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block + packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes + // write the footer + crc = crc32(crc32(0L, NULL, 0L), (Bytef*)src, slen); + packInt32((uint8_t*)&dst[*dlen - 8], crc); + packInt32((uint8_t*)&dst[*dlen - 4], slen); + return 0; +} + +// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length. 
+static int deflate_block(BGZF *fp, int block_length) +{ + int comp_size = BGZF_MAX_BLOCK_SIZE; + if (bgzf_compress(fp->compressed_block, &comp_size, fp->uncompressed_block, block_length, fp->compress_level) != 0) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + fp->block_offset = 0; + return comp_size; +} + +// Inflate the block in fp->compressed_block into fp->uncompressed_block +static int inflate_block(BGZF* fp, int block_length) +{ + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = (Bytef*)fp->uncompressed_block; + zs.avail_out = BGZF_MAX_BLOCK_SIZE; + + if (inflateInit2(&zs, -15) != Z_OK) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + if (inflate(&zs, Z_FINISH) != Z_STREAM_END) { + inflateEnd(&zs); + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + if (inflateEnd(&zs) != Z_OK) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + return zs.total_out; +} + +static int inflate_gzip_block(BGZF *fp, int cached) +{ + int ret = Z_OK; + do + { + if ( !cached && fp->gz_stream->avail_out!=0 ) + { + fp->gz_stream->avail_in = hread(fp->fp, fp->compressed_block, BGZF_BLOCK_SIZE); + if ( fp->gz_stream->avail_in<=0 ) return fp->gz_stream->avail_in; + if ( fp->gz_stream->avail_in==0 ) break; + fp->gz_stream->next_in = fp->compressed_block; + } + else cached = 0; + do + { + fp->gz_stream->next_out = (Bytef*)fp->uncompressed_block + fp->block_offset; + fp->gz_stream->avail_out = BGZF_MAX_BLOCK_SIZE - fp->block_offset; + ret = inflate(fp->gz_stream, Z_NO_FLUSH); + if ( ret<0 ) return -1; + unsigned int have = BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out; + if ( have ) return have; + } + while ( fp->gz_stream->avail_out == 0 ); + } + while (ret != Z_STREAM_END); + return BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out; +} + +// Returns: 0 on success (BGZF header); -1 on non-BGZF GZIP header; -2 on error +static int check_header(const uint8_t *header) +{ + if ( 
header[0] != 31 || header[1] != 139 || header[2] != 8 ) return -2; + return ((header[3] & 4) != 0 + && unpackInt16((uint8_t*)&header[10]) == 6 + && header[12] == 'B' && header[13] == 'C' + && unpackInt16((uint8_t*)&header[14]) == 2) ? 0 : -1; +} + +#ifdef BGZF_CACHE +static void free_cache(BGZF *fp) +{ + khint_t k; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (fp->is_write) return; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(cache, h); +} + +static int load_block_from_cache(BGZF *fp, int64_t block_address) +{ + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + k = kh_get(cache, h, block_address); + if (k == kh_end(h)) return 0; + p = &kh_val(h, k); + if (fp->block_length != 0) fp->block_offset = 0; + fp->block_address = block_address; + fp->block_length = p->size; + memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE); + if ( hseek(fp->fp, p->end_offset, SEEK_SET) < 0 ) + { + // todo: move the error up + fprintf(stderr,"Could not hseek to %"PRId64"\n", p->end_offset); + exit(1); + } + return p->size; +} + +static void cache_block(BGZF *fp, int size) +{ + int ret; + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) { + /* A better way would be to remove the oldest block in the + * cache, but here we remove a random one for simplicity. This + * should not have a big impact on performance. */ + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) break; + if (k < kh_end(h)) { + free(kh_val(h, k).block); + kh_del(cache, h, k); + } + } + k = kh_put(cache, h, fp->block_address, &ret); + if (ret == 0) return; // if this happens, a bug! 
+ p = &kh_val(h, k); + p->size = fp->block_length; + p->end_offset = fp->block_address + size; + p->block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE); +} +#else +static void free_cache(BGZF *fp) {} +static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} +static void cache_block(BGZF *fp, int size) {} +#endif + +int bgzf_read_block(BGZF *fp) +{ + uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block; + int count, size = 0, block_length, remaining; + + // Reading an uncompressed file + if ( !fp->is_compressed ) + { + count = hread(fp->fp, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE); + if ( count==0 ) + { + fp->block_length = 0; + return 0; + } + if (fp->block_length != 0) fp->block_offset = 0; + fp->block_address += count; + fp->block_length = count; + return 0; + } + + // Reading compressed file + int64_t block_address; + block_address = htell(fp->fp); + if ( fp->is_gzip ) + { + count = inflate_gzip_block(fp, 0); + if ( count<0 ) + { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + fp->block_length = count; + fp->block_address = block_address; + return 0; + } + if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0; + count = hread(fp->fp, header, sizeof(header)); + if (count == 0) { // no data read + fp->block_length = 0; + return 0; + } + int ret; + if ( count != sizeof(header) || (ret=check_header(header))==-2 ) + { + fp->errcode |= BGZF_ERR_HEADER; + return -1; + } + if ( ret==-1 ) + { + // GZIP, not BGZF + uint8_t *cblock = (uint8_t*)fp->compressed_block; + memcpy(cblock, header, sizeof(header)); + count = hread(fp->fp, cblock+sizeof(header), BGZF_BLOCK_SIZE - sizeof(header)) + sizeof(header); + int nskip = 10; + + // Check optional fields to skip: FLG.FNAME,FLG.FCOMMENT,FLG.FHCRC,FLG.FEXTRA + // Note: Some of these fields are untested, I did not have appropriate data available + if ( header[3] & 0x4 ) // FLG.FEXTRA + { + nskip += 
unpackInt16(&cblock[nskip]) + 2; + } + if ( header[3] & 0x8 ) // FLG.FNAME + { + while ( nskiperrcode |= BGZF_ERR_HEADER; + return -1; + } + nskip++; + } + if ( header[3] & 0x10 ) // FLG.FCOMMENT + { + while ( nskiperrcode |= BGZF_ERR_HEADER; + return -1; + } + nskip++; + } + if ( header[3] & 0x2 ) nskip += 2; // FLG.FHCRC + + fp->is_gzip = 1; + fp->gz_stream = (z_stream*) calloc(1,sizeof(z_stream)); + int ret = inflateInit2(fp->gz_stream, -15); + if (ret != Z_OK) + { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + fp->gz_stream->avail_in = count - nskip; + fp->gz_stream->next_in = cblock + nskip; + count = inflate_gzip_block(fp, 1); + if ( count<0 ) + { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + fp->block_length = count; + fp->block_address = block_address; + return 0; + } + size = count; + block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1" + compressed_block = (uint8_t*)fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + remaining = block_length - BLOCK_HEADER_LENGTH; + count = hread(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining); + if (count != remaining) { + fp->errcode |= BGZF_ERR_IO; + return -1; + } + size += count; + if ((count = inflate_block(fp, block_length)) < 0) return -1; + if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek. 
+ fp->block_address = block_address; + fp->block_length = count; + if ( fp->idx_build_otf ) + { + bgzf_index_add_block(fp); + fp->idx->ublock_addr += count; + } + cache_block(fp, size); + return 0; +} + +ssize_t bgzf_read(BGZF *fp, void *data, size_t length) +{ + ssize_t bytes_read = 0; + uint8_t *output = (uint8_t*)data; + if (length <= 0) return 0; + assert(fp->is_write == 0); + while (bytes_read < length) { + int copy_length, available = fp->block_length - fp->block_offset; + uint8_t *buffer; + if (available <= 0) { + if (bgzf_read_block(fp) != 0) return -1; + available = fp->block_length - fp->block_offset; + if (available <= 0) break; + } + copy_length = length - bytes_read < available? length - bytes_read : available; + buffer = (uint8_t*)fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { + fp->block_address = htell(fp->fp); + fp->block_offset = fp->block_length = 0; + } + fp->uncompressed_address += bytes_read; + return bytes_read; +} + +ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length) +{ + return hread(fp->fp, data, length); +} + +#ifdef BGZF_MT + +typedef struct { + struct bgzf_mtaux_t *mt; + void *buf; + int i, errcode, toproc, compress_level; +} worker_t; + +typedef struct bgzf_mtaux_t { + int n_threads, n_blks, curr, done; + volatile int proc_cnt; + void **blk; + int *len; + worker_t *w; + pthread_t *tid; + pthread_mutex_t lock; + pthread_cond_t cv; +} mtaux_t; + +static int worker_aux(worker_t *w) +{ + int i, stop = 0; + // wait for condition: to process or all done + pthread_mutex_lock(&w->mt->lock); + while (!w->toproc && !w->mt->done) + pthread_cond_wait(&w->mt->cv, &w->mt->lock); + if (w->mt->done) stop = 1; + w->toproc = 0; + pthread_mutex_unlock(&w->mt->lock); + if (stop) return 1; // to quit the thread + w->errcode = 0; + for (i = w->i; i < w->mt->curr; i += 
w->mt->n_threads) { + int clen = BGZF_MAX_BLOCK_SIZE; + if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->compress_level) != 0) + w->errcode |= BGZF_ERR_ZLIB; + memcpy(w->mt->blk[i], w->buf, clen); + w->mt->len[i] = clen; + } + __sync_fetch_and_add(&w->mt->proc_cnt, 1); + return 0; +} + +static void *mt_worker(void *data) +{ + while (worker_aux((worker_t*)data) == 0); + return 0; +} + +int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) +{ + int i; + mtaux_t *mt; + pthread_attr_t attr; + if (!fp->is_write || fp->mt || n_threads <= 1) return -1; + mt = (mtaux_t*)calloc(1, sizeof(mtaux_t)); + mt->n_threads = n_threads; + mt->n_blks = n_threads * n_sub_blks; + mt->len = (int*)calloc(mt->n_blks, sizeof(int)); + mt->blk = (void**)calloc(mt->n_blks, sizeof(void*)); + for (i = 0; i < mt->n_blks; ++i) + mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE); + mt->tid = (pthread_t*)calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master + mt->w = (worker_t*)calloc(mt->n_threads, sizeof(worker_t)); + for (i = 0; i < mt->n_threads; ++i) { + mt->w[i].i = i; + mt->w[i].mt = mt; + mt->w[i].compress_level = fp->compress_level; + mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE); + } + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + pthread_mutex_init(&mt->lock, 0); + pthread_cond_init(&mt->cv, 0); + for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread + pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]); + fp->mt = mt; + return 0; +} + +static void mt_destroy(mtaux_t *mt) +{ + int i; + // signal all workers to quit + pthread_mutex_lock(&mt->lock); + mt->done = 1; mt->proc_cnt = 0; + pthread_cond_broadcast(&mt->cv); + pthread_mutex_unlock(&mt->lock); + for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread + // free other data allocated on heap + for (i = 0; i < mt->n_blks; ++i) 
free(mt->blk[i]); + for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf); + free(mt->blk); free(mt->len); free(mt->w); free(mt->tid); + pthread_cond_destroy(&mt->cv); + pthread_mutex_destroy(&mt->lock); + free(mt); +} + +static void mt_queue(BGZF *fp) +{ + mtaux_t *mt = fp->mt; + assert(mt->curr < mt->n_blks); // guaranteed by the caller + memcpy(mt->blk[mt->curr], fp->uncompressed_block, fp->block_offset); + mt->len[mt->curr] = fp->block_offset; + fp->block_offset = 0; + ++mt->curr; +} + +static int mt_flush_queue(BGZF *fp) +{ + int i; + mtaux_t *mt = fp->mt; + // signal all the workers to compress + pthread_mutex_lock(&mt->lock); + for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1; + mt->proc_cnt = 0; + pthread_cond_broadcast(&mt->cv); + pthread_mutex_unlock(&mt->lock); + // worker 0 is doing things here + worker_aux(&mt->w[0]); + // wait for all the threads to complete + while (mt->proc_cnt < mt->n_threads); + // dump data to disk + for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode; + for (i = 0; i < mt->curr; ++i) + if (hwrite(fp->fp, mt->blk[i], mt->len[i]) != mt->len[i]) { + fp->errcode |= BGZF_ERR_IO; + break; + } + mt->curr = 0; + return (fp->errcode == 0)? 0 : -1; +} + +static int lazy_flush(BGZF *fp) +{ + if (fp->mt) { + if (fp->block_offset) mt_queue(fp); + return (fp->mt->curr < fp->mt->n_blks)? 
0 : mt_flush_queue(fp); + } + else return bgzf_flush(fp); +} + +#else // ~ #ifdef BGZF_MT + +int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) +{ + return 0; +} + +static inline int lazy_flush(BGZF *fp) +{ + return bgzf_flush(fp); +} + +#endif // ~ #ifdef BGZF_MT + +int bgzf_flush(BGZF *fp) +{ + if (!fp->is_write) return 0; +#ifdef BGZF_MT + if (fp->mt) { + if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail + return mt_flush_queue(fp); + } +#endif + while (fp->block_offset > 0) { + if ( fp->idx_build_otf ) + { + bgzf_index_add_block(fp); + fp->idx->ublock_addr += fp->block_offset; + } + int block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) return -1; + if (hwrite(fp->fp, fp->compressed_block, block_length) != block_length) { + fp->errcode |= BGZF_ERR_IO; // possibly truncated file + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int bgzf_flush_try(BGZF *fp, ssize_t size) +{ + if (fp->block_offset + size > BGZF_BLOCK_SIZE) return lazy_flush(fp); + return 0; +} + +ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) +{ + if ( !fp->is_compressed ) + return hwrite(fp->fp, data, length); + + const uint8_t *input = (const uint8_t*)data; + ssize_t remaining = length; + assert(fp->is_write); + while (remaining > 0) { + uint8_t* buffer = (uint8_t*)fp->uncompressed_block; + int copy_length = BGZF_BLOCK_SIZE - fp->block_offset; + if (copy_length > remaining) copy_length = remaining; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + remaining -= copy_length; + if (fp->block_offset == BGZF_BLOCK_SIZE) { + if (lazy_flush(fp) != 0) return -1; + } + } + return length - remaining; +} + +ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length) +{ + return hwrite(fp->fp, data, length); +} + +int bgzf_close(BGZF* fp) +{ + int ret, block_length; + if (fp == 0) return -1; + if (fp->is_write && fp->is_compressed) { + if 
(bgzf_flush(fp) != 0) return -1; + fp->compress_level = -1; + block_length = deflate_block(fp, 0); // write an empty block + if (hwrite(fp->fp, fp->compressed_block, block_length) < 0 + || hflush(fp->fp) != 0) { + fp->errcode |= BGZF_ERR_IO; + return -1; + } +#ifdef BGZF_MT + if (fp->mt) mt_destroy(fp->mt); +#endif + } + if ( fp->is_gzip ) + { + (void)inflateEnd(fp->gz_stream); + free(fp->gz_stream); + } + ret = hclose(fp->fp); + if (ret != 0) return -1; + bgzf_index_destroy(fp); + free(fp->uncompressed_block); + free(fp->compressed_block); + free_cache(fp); + free(fp); + return 0; +} + +void bgzf_set_cache_size(BGZF *fp, int cache_size) +{ + if (fp) fp->cache_size = cache_size; +} + +int bgzf_check_EOF(BGZF *fp) +{ + uint8_t buf[28]; + off_t offset = htell(fp->fp); + if (hseek(fp->fp, -28, SEEK_END) < 0) { + if (errno == ESPIPE) { hclearerr(fp->fp); return 2; } + else return -1; + } + if ( hread(fp->fp, buf, 28) != 28 ) return -1; + if ( hseek(fp->fp, offset, SEEK_SET) < 0 ) return -1; + return (memcmp("\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0", buf, 28) == 0)? 1 : 0; +} + +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + int block_offset; + int64_t block_address; + + if (fp->is_write || where != SEEK_SET) { + fp->errcode |= BGZF_ERR_MISUSE; + return -1; + } + block_offset = pos & 0xFFFF; + block_address = pos >> 16; + if (hseek(fp->fp, block_address, SEEK_SET) < 0) { + fp->errcode |= BGZF_ERR_IO; + return -1; + } + fp->block_length = 0; // indicates current block has not been loaded + fp->block_address = block_address << 16; + fp->block_offset = block_offset; + return 0; +} + +int bgzf_is_bgzf(const char *fn) +{ + uint8_t buf[16]; + int n; + hFILE *fp; + if ((fp = hopen(fn, "r")) == 0) return 0; + n = hread(fp, buf, 16); + if ( hclose(fp) < 0 ) return -1; + if (n != 16) return 0; + return memcmp(g_magic, buf, 16) == 0? 
1 : 0; +} + +int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { + fp->block_address = htell(fp->fp); + fp->block_offset = 0; + fp->block_length = 0; + } + fp->uncompressed_address++; + return c; +} + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +int bgzf_getline(BGZF *fp, int delim, kstring_t *str) +{ + int l, state = 0; + unsigned char *buf = (unsigned char*)fp->uncompressed_block; + str->l = 0; + do { + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) { state = -2; break; } + if (fp->block_length == 0) { state = -1; break; } + } + for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); + if (l < fp->block_length) state = 1; + l -= fp->block_offset; + if (str->l + l + 1 >= str->m) { + str->m = str->l + l + 2; + kroundup32(str->m); + str->s = (char*)realloc(str->s, str->m); + } + memcpy(str->s + str->l, buf + fp->block_offset, l); + str->l += l; + fp->block_offset += l + 1; + if (fp->block_offset >= fp->block_length) { + fp->block_address = htell(fp->fp); + fp->block_offset = 0; + fp->block_length = 0; + } + } while (state == 0); + if (str->l == 0 && state < 0) return state; + fp->uncompressed_address += str->l; + str->s[str->l] = 0; + return str->l; +} + +void bgzf_index_destroy(BGZF *fp) +{ + if ( !fp->idx ) return; + free(fp->idx->offs); + free(fp->idx); + fp->idx = NULL; + fp->idx_build_otf = 0; +} + +int bgzf_index_build_init(BGZF *fp) +{ + bgzf_index_destroy(fp); + fp->idx = (bgzidx_t*) calloc(1,sizeof(bgzidx_t)); + if ( !fp->idx ) return -1; + fp->idx_build_otf = 1; // build index on the fly + return 0; +} + +int bgzf_index_add_block(BGZF *fp) +{ + fp->idx->noffs++; + 
if ( fp->idx->noffs > fp->idx->moffs ) + { + fp->idx->moffs = fp->idx->noffs; + kroundup32(fp->idx->moffs); + fp->idx->offs = (bgzidx1_t*) realloc(fp->idx->offs, fp->idx->moffs*sizeof(bgzidx1_t)); + if ( !fp->idx->offs ) return -1; + } + fp->idx->offs[ fp->idx->noffs-1 ].uaddr = fp->idx->ublock_addr; + fp->idx->offs[ fp->idx->noffs-1 ].caddr = fp->block_address; + return 0; +} + +int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix) +{ + if (bgzf_flush(fp) != 0) return -1; + + assert(fp->idx); + char *tmp = NULL; + if ( suffix ) + { + int blen = strlen(bname); + int slen = strlen(suffix); + tmp = (char*) malloc(blen + slen + 1); + if ( !tmp ) return -1; + memcpy(tmp,bname,blen); + memcpy(tmp+blen,suffix,slen+1); + } + + FILE *idx = fopen(tmp?tmp:bname,"wb"); + if ( tmp ) free(tmp); + if ( !idx ) return -1; + + // Note that the index contains one extra record when indexing files opened + // for reading. The terminating record is not present when opened for writing. + // This is not a bug. 
+ + int i; + if ( fp->is_be ) + { + uint64_t x = fp->idx->noffs - 1; + fwrite(ed_swap_8p(&x), 1, sizeof(x), idx); + for (i=1; iidx->noffs; i++) + { + x = fp->idx->offs[i].caddr; fwrite(ed_swap_8p(&x), 1, sizeof(x), idx); + x = fp->idx->offs[i].uaddr; fwrite(ed_swap_8p(&x), 1, sizeof(x), idx); + } + } + else + { + uint64_t x = fp->idx->noffs - 1; + fwrite(&x, 1, sizeof(x), idx); + for (i=1; iidx->noffs; i++) + { + fwrite(&fp->idx->offs[i].caddr, 1, sizeof(fp->idx->offs[i].caddr), idx); + fwrite(&fp->idx->offs[i].uaddr, 1, sizeof(fp->idx->offs[i].uaddr), idx); + } + } + fclose(idx); + return 0; +} + + +int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix) +{ + char *tmp = NULL; + if ( suffix ) + { + int blen = strlen(bname); + int slen = strlen(suffix); + tmp = (char*) malloc(blen + slen + 1); + if ( !tmp ) return -1; + memcpy(tmp,bname,blen); + memcpy(tmp+blen,suffix,slen+1); + } + + FILE *idx = fopen(tmp?tmp:bname,"rb"); + if ( tmp ) free(tmp); + if ( !idx ) return -1; + + fp->idx = (bgzidx_t*) calloc(1,sizeof(bgzidx_t)); + uint64_t x; + if ( fread(&x, 1, sizeof(x), idx) != sizeof(x) ) return -1; + + fp->idx->noffs = fp->idx->moffs = 1 + (fp->is_be ? 
ed_swap_8(x) : x); + fp->idx->offs = (bgzidx1_t*) malloc(fp->idx->moffs*sizeof(bgzidx1_t)); + fp->idx->offs[0].caddr = fp->idx->offs[0].uaddr = 0; + + int i; + if ( fp->is_be ) + { + int ret = 0; + for (i=1; iidx->noffs; i++) + { + ret += fread(&x, 1, sizeof(x), idx); fp->idx->offs[i].caddr = ed_swap_8(x); + ret += fread(&x, 1, sizeof(x), idx); fp->idx->offs[i].uaddr = ed_swap_8(x); + } + if ( ret != sizeof(x)*2*(fp->idx->noffs-1) ) return -1; + } + else + { + int ret = 0; + for (i=1; iidx->noffs; i++) + { + ret += fread(&x, 1, sizeof(x), idx); fp->idx->offs[i].caddr = x; + ret += fread(&x, 1, sizeof(x), idx); fp->idx->offs[i].uaddr = x; + } + if ( ret != sizeof(x)*2*(fp->idx->noffs-1) ) return -1; + } + fclose(idx); + return 0; + +} + +int bgzf_useek(BGZF *fp, long uoffset, int where) +{ + if ( !fp->is_compressed ) + { + if (hseek(fp->fp, uoffset, SEEK_SET) < 0) + { + fp->errcode |= BGZF_ERR_IO; + return -1; + } + fp->block_length = 0; // indicates current block has not been loaded + fp->block_address = uoffset; + fp->block_offset = 0; + bgzf_read_block(fp); + fp->uncompressed_address = uoffset; + return 0; + } + + if ( !fp->idx ) + { + fp->errcode |= BGZF_ERR_IO; + return -1; + } + + // binary search + int ilo = 0, ihi = fp->idx->noffs - 1; + while ( ilo<=ihi ) + { + int i = (ilo+ihi)*0.5; + if ( uoffset < fp->idx->offs[i].uaddr ) ihi = i - 1; + else if ( uoffset >= fp->idx->offs[i].uaddr ) ilo = i + 1; + else break; + } + int i = ilo-1; + if (hseek(fp->fp, fp->idx->offs[i].caddr, SEEK_SET) < 0) + { + fp->errcode |= BGZF_ERR_IO; + return -1; + } + fp->block_length = 0; // indicates current block has not been loaded + fp->block_address = fp->idx->offs[i].caddr; + fp->block_offset = 0; + if ( bgzf_read_block(fp) < 0 ) return -1; + if ( uoffset - fp->idx->offs[i].uaddr > 0 ) + { + fp->block_offset = uoffset - fp->idx->offs[i].uaddr; + assert( fp->block_offset <= fp->block_length ); // todo: skipped, unindexed, blocks + } + fp->uncompressed_address = uoffset; + 
return 0; +} + +long bgzf_utell(BGZF *fp) +{ + return fp->uncompressed_address; // currently maintained only when reading +} + diff --git a/star-sys/STAR/source/htslib/bgzip.c b/star-sys/STAR/source/htslib/bgzip.c new file mode 100644 index 0000000..bcd81f8 --- /dev/null +++ b/star-sys/STAR/source/htslib/bgzip.c @@ -0,0 +1,283 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "htslib/bgzf.h" +#include "htslib/hts.h" + +static const int WINDOW_SIZE = 64 * 1024; + +static void error(const char *format, ...) 
+{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + exit(EXIT_FAILURE); +} + +static int write_open(const char *fn, int is_forced) +{ + int fd = -1; + char c; + if (!is_forced) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { + fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn); + if ( scanf("%c", &c) != 1 ) c = 'n'; + if (c != 'Y' && c != 'y') { + fprintf(stderr, "[bgzip] not overwritten\n"); + exit(EXIT_FAILURE); + } + } + } + if (fd < 0) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { + fprintf(stderr, "[bgzip] %s: Fail to write\n", fn); + exit(EXIT_FAILURE); + } + } + return fd; +} + +static int bgzip_main_usage(void) +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Version: %s\n", hts_version()); + fprintf(stderr, "Usage: bgzip [OPTIONS] [FILE] ...\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -b, --offset INT decompress at virtual file pointer (0-based uncompressed offset)\n"); + fprintf(stderr, " -c, --stdout write on standard output, keep original files unchanged\n"); + fprintf(stderr, " -d, --decompress decompress\n"); + fprintf(stderr, " -f, --force overwrite files without asking\n"); + fprintf(stderr, " -h, --help give this help\n"); + fprintf(stderr, " -i, --index compress and create BGZF index\n"); + fprintf(stderr, " -I, --index-name FILE name of BGZF index file [file.gz.gzi]\n"); + fprintf(stderr, " -r, --reindex (re)index compressed file\n"); + fprintf(stderr, " -s, --size INT decompress INT bytes (uncompressed size)\n"); + fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char **argv) +{ + int c, compress, pstdout, is_forced, index = 0, reindex = 0; + BGZF *fp; + void *buffer; + long start, end, size; + char *index_fname = NULL; + + static struct option loptions[] = + { + {"help",0,0,'h'}, + {"offset",1,0,'b'}, + {"stdout",0,0,'c'}, + {"decompress",0,0,'d'}, + {"force",0,0,'f'}, + 
{"index",0,0,'i'}, + {"index-name",1,0,'I'}, + {"reindex",0,0,'r'}, + {"size",1,0,'s'}, + {0,0,0,0} + }; + + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; + while((c = getopt_long(argc, argv, "cdh?fb:s:iI:r",loptions,NULL)) >= 0){ + switch(c){ + case 'd': compress = 0; break; + case 'c': pstdout = 1; break; + case 'b': start = atol(optarg); compress = 0; pstdout = 1; break; + case 's': size = atol(optarg); pstdout = 1; break; + case 'f': is_forced = 1; break; + case 'i': index = 1; break; + case 'I': index_fname = optarg; break; + case 'r': reindex = 1; compress = 0; break; + case 'h': + case '?': return bgzip_main_usage(); + } + } + if (size >= 0) end = start + size; + if (end >= 0 && end < start) { + fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); + return 1; + } + if (compress == 1) { + struct stat sbuf; + int f_src = fileno(stdin); + int f_dst = fileno(stdout); + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + + if ((f_src = open(argv[optind], O_RDONLY)) < 0) { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + + if (pstdout) + f_dst = fileno(stdout); + else + { + char *name = malloc(strlen(argv[optind]) + 5); + strcpy(name, argv[optind]); + strcat(name, ".gz"); + f_dst = write_open(name, is_forced); + if (f_dst < 0) return 1; + free(name); + } + } + else if (!pstdout && isatty(fileno((FILE *)stdout)) ) + return bgzip_main_usage(); + else if ( index && !index_fname ) + { + fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); + return 1; + } + + fp = bgzf_fdopen(f_dst, "w"); + if ( index ) bgzf_index_build_init(fp); + buffer = malloc(WINDOW_SIZE); + while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); + // f_dst will be closed here + if ( index ) + { + 
if ( index_fname ) bgzf_index_dump(fp, index_fname, NULL); + else bgzf_index_dump(fp, argv[optind], ".gz.gzi"); + } + if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode); + if (argc > optind && !pstdout) unlink(argv[optind]); + free(buffer); + close(f_src); + return 0; + } + else if ( reindex ) + { + if ( argc>optind ) + { + fp = bgzf_open(argv[optind], "r"); + if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]); + } + else + { + if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n"); + fp = bgzf_fdopen(fileno(stdin), "r"); + if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno)); + } + + buffer = malloc(BGZF_BLOCK_SIZE); + bgzf_index_build_init(fp); + int ret; + while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ; + free(buffer); + if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n"); + + if ( index_fname ) + bgzf_index_dump(fp, index_fname, NULL); + else + bgzf_index_dump(fp, argv[optind], ".gzi"); + + if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode); + return 0; + } + else + { + struct stat sbuf; + int f_dst; + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + char *name; + int len = strlen(argv[optind]); + if ( strcmp(argv[optind]+len-3,".gz") ) + { + fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + fp = bgzf_open(argv[optind], "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); + return 1; + } + + if (pstdout) { + f_dst = fileno(stdout); + } + else { + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } + } + else if (!pstdout && isatty(fileno((FILE *)stdin)) ) + return bgzip_main_usage(); + else + { + f_dst = fileno(stdout); + fp = 
bgzf_fdopen(fileno(stdin), "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); + return 1; + } + } + buffer = malloc(WINDOW_SIZE); + if ( start>0 ) + { + if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); + if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start); + } + while (1) { + if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); + else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); + if (c == 0) break; + if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode); + start += c; + if ( write(f_dst, buffer, c) != c ) error("Could not write %d bytes\n", c); + if (end >= 0 && start >= end) break; + } + free(buffer); + if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode); + if (!pstdout) unlink(argv[optind]); + return 0; + } + return 0; +} diff --git a/star-sys/STAR/source/htslib/config.h b/star-sys/STAR/source/htslib/config.h new file mode 100644 index 0000000..6d32bf5 --- /dev/null +++ b/star-sys/STAR/source/htslib/config.h @@ -0,0 +1,3 @@ +#define _USE_KNETFILE +#define BGZF_CACHE +#define BGZF_MT diff --git a/star-sys/STAR/source/htslib/cram/cram.h b/star-sys/STAR/source/htslib/cram/cram.h new file mode 100644 index 0000000..0b8b291 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram.h @@ -0,0 +1,66 @@ +/* +Copyright (c) 2012-2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/*! \file + * CRAM interface. + * + * Consider using the higher level scram_*() API for programs that wish to + * be file format agnostic. + * + * This API should be used for CRAM specific code. The specifics of the + * public API are implemented in cram_io.h, cram_encode.h and cram_decode.h + * although these should not be included directly (use this file instead). 
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef SAMTOOLS +# include "cram/cram_samtools.h" +#endif + +#ifndef _CRAM_H_ +#define _CRAM_H_ + +#include "cram/sam_header.h" +#include "cram_structs.h" +#include "cram_io.h" +#include "cram_encode.h" +#include "cram_decode.h" +#include "cram_stats.h" +#include "cram_codecs.h" +#include "cram_index.h" + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/star-sys/STAR/source/htslib/cram/cram_codecs.c b/star-sys/STAR/source/htslib/cram/cram_codecs.c new file mode 100644 index 0000000..3c3d13f --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_codecs.c @@ -0,0 +1,1764 @@ +/* +Copyright (c) 2012-2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * FIXME: add checking of cram_external_type to return NULL on unsupported + * {codec,type} tuples. + */ + +#ifdef HAVE_CONFIG_H +#include "io_lib_config.h" +#endif + +#include +#include +#include +#include + +#include "cram/cram.h" + +static char *codec2str(enum cram_encoding codec) { + switch (codec) { + case E_NULL: return "NULL"; + case E_EXTERNAL: return "EXTERNAL"; + case E_GOLOMB: return "GOLOMB"; + case E_HUFFMAN: return "HUFFMAN"; + case E_BYTE_ARRAY_LEN: return "BYTE_ARRAY_LEN"; + case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP"; + case E_BETA: return "BETA"; + case E_SUBEXP: return "SUBEXP"; + case E_GOLOMB_RICE: return "GOLOMB_RICE"; + case E_GAMMA: return "GAMMA"; + } + + return "(unknown)"; +} + +/* + * --------------------------------------------------------------------------- + * Block bit-level I/O functions. + * All defined static here to promote easy inlining by the compiler. 
+ */ + +#if 0 +/* Get a single bit, MSB first */ +static signed int get_bit_MSB(cram_block *block) { + unsigned int val; + + if (block->byte > block->alloc) + return -1; + + val = block->data[block->byte] >> block->bit; + if (--block->bit == -1) { + block->bit = 7; + block->byte++; + //printf("(%02X)", block->data[block->byte]); + } + + //printf("-B%d-", val&1); + + return val & 1; +} +#endif + +/* + * Count number of successive 0 and 1 bits + */ +static int get_one_bits_MSB(cram_block *block) { + int n = 0, b; + do { + b = block->data[block->byte] >> block->bit; + if (--block->bit == -1) { + block->bit = 7; + block->byte++; + } + n++; + } while (b&1); + + return n-1; +} + +static int get_zero_bits_MSB(cram_block *block) { + int n = 0, b; + do { + b = block->data[block->byte] >> block->bit; + if (--block->bit == -1) { + block->bit = 7; + block->byte++; + } + n++; + } while (!(b&1)); + + return n-1; +} + +#if 0 +/* Stores a single bit */ +static void store_bit_MSB(cram_block *block, unsigned int bit) { + if (block->byte >= block->alloc) { + block->alloc = block->alloc ? block->alloc*2 : 1024; + block->data = realloc(block->data, block->alloc); + } + + if (bit) + block->data[block->byte] |= (1 << block->bit); + + if (--block->bit == -1) { + block->bit = 7; + block->byte++; + block->data[block->byte] = 0; + } +} +#endif + +#if 0 +/* Rounds to the next whole byte boundary first */ +static void store_bytes_MSB(cram_block *block, char *bytes, int len) { + if (block->bit != 7) { + block->bit = 7; + block->byte++; + } + + while (block->byte + len >= block->alloc) { + block->alloc = block->alloc ? 
block->alloc*2 : 1024; + block->data = realloc(block->data, block->alloc); + } + + memcpy(&block->data[block->byte], bytes, len); + block->byte += len; +} +#endif + +/* Local optimised copy for inlining */ +static inline unsigned int get_bits_MSB(cram_block *block, int nbits) { + unsigned int val = 0; + int i; + +#if 0 + // Fits within the current byte */ + if (nbits <= block->bit+1) { + val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<bit -= nbits) == -1) { + block->bit = 7; + block->byte++; + } + return val; + } + + // partial first byte + val = block->data[block->byte] & ((1<<(block->bit+1))-1); + nbits -= block->bit+1; + block->bit = 7; + block->byte++; + + // whole middle bytes + while (nbits >= 8) { + val = (val << 8) | block->data[block->byte++]; + nbits -= 8; + } + + val <<= nbits; + val |= (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<bit -= nbits; + return val; +#endif + +#if 0 + /* Inefficient implementation! */ + //printf("{"); + for (i = 0; i < nbits; i++) + //val = (val << 1) | get_bit_MSB(block); + GET_BIT_MSB(block, val); +#endif + +#if 1 + /* Combination of 1st two methods */ + if (nbits <= block->bit+1) { + val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<bit -= nbits) == -1) { + block->bit = 7; + block->byte++; + } + return val; + } + + switch(nbits) { +// case 15: GET_BIT_MSB(block, val); +// case 14: GET_BIT_MSB(block, val); +// case 13: GET_BIT_MSB(block, val); +// case 12: GET_BIT_MSB(block, val); +// case 11: GET_BIT_MSB(block, val); +// case 10: GET_BIT_MSB(block, val); +// case 9: GET_BIT_MSB(block, val); + case 8: GET_BIT_MSB(block, val); + case 7: GET_BIT_MSB(block, val); + case 6: GET_BIT_MSB(block, val); + case 5: GET_BIT_MSB(block, val); + case 4: GET_BIT_MSB(block, val); + case 3: GET_BIT_MSB(block, val); + case 2: GET_BIT_MSB(block, val); + case 1: GET_BIT_MSB(block, val); + break; + + default: + for (i = 0; i < nbits; i++) + //val = (val << 1) | get_bit_MSB(block); + GET_BIT_MSB(block, val); + 
} +#endif + + //printf("=0x%x}", val); + + return val; +} + +/* + * Can store up to 24-bits worth of data encoded in an integer value + * Possibly we'd want to have a less optimal store_bits function when dealing + * with nbits > 24, but for now we assume the codes generated are never + * that big. (Given this is only possible with 121392 or more + * characters with exactly the correct frequency distribution we check + * for it elsewhere.) + */ +static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) { + /* fprintf(stderr, " store_bits: %02x %d\n", val, nbits); */ + + /* + * Use slow mode until we tweak the huffman generator to never generate + * codes longer than 24-bits. + */ + unsigned int mask; + + if (block->byte+4 >= block->alloc) { + if (block->byte) { + block->alloc *= 2; + block->data = realloc(block->data, block->alloc + 4); + if (!block->data) + return -1; + } else { + block->alloc = 1024; + block->data = realloc(block->data, block->alloc + 4); + if (!block->data) + return -1; + block->data[0] = 0; // initialise first byte of buffer + } + } + + + + if (nbits <= block->bit+1) { + block->data[block->byte] |= (val << (block->bit+1-nbits)); + if ((block->bit-=nbits) == -1) { + block->bit = 7; + block->byte++; + block->data[block->byte] = 0; + } + return 0; + } + + block->data[block->byte] |= (val >> (nbits -= block->bit+1)); + block->bit = 7; + block->byte++; + block->data[block->byte] = 0; + + mask = 1<<(nbits-1); + do { + if (val & mask) + block->data[block->byte] |= (1 << block->bit); + if (--block->bit == -1) { + block->bit = 7; + block->byte++; + block->data[block->byte] = 0; + } + mask >>= 1; + } while(--nbits); + + return 0; +} + +/* + * Returns the next 'size' bytes from a block, or NULL if insufficient + * data left.This is just a pointer into the block data and not an + * allocated object, so do not free the result. 
+ */ +static char *cram_extract_block(cram_block *b, int size) { + char *cp = (char *)b->data + b->idx; + b->idx += size; + if (b->idx > b->uncomp_size) + return NULL; + + return cp; +} + +/* + * --------------------------------------------------------------------------- + * EXTERNAL + */ +int cram_external_decode_int(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int i; + char *cp; + cram_block *b = NULL; + + /* Find the external block */ + if (slice->block_by_id) { + if (!(b = slice->block_by_id[c->external.content_id])) + return -1; + } else { + for (i = 0; i < slice->hdr->num_blocks; i++) { + b = slice->block[i]; + if (b->content_type == EXTERNAL && + b->content_id == c->external.content_id) { + break; + } + } + if (i == slice->hdr->num_blocks || !b) + return -1; + } + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + b->idx += itf8_get(cp, (int32_t *)out); + *out_size = 1; + + return 0; +} + +int cram_external_decode_char(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, + int *out_size) { + int i; + char *cp; + cram_block *b = NULL; + + /* Find the external block */ + if (slice->block_by_id) { + if (!(b = slice->block_by_id[c->external.content_id])) + return -1; + } else { + for (i = 0; i < slice->hdr->num_blocks; i++) { + b = slice->block[i]; + if (b->content_type == EXTERNAL && + b->content_id == c->external.content_id) { + break; + } + } + if (i == slice->hdr->num_blocks || !b) + return -1; + } + + cp = cram_extract_block(b, *out_size); + if (!cp) + return -1; + + memcpy(out, cp, *out_size); + return 0; +} + +int cram_external_decode_block(cram_slice *slice, cram_codec *c, + cram_block *in, char *out_, + int *out_size) { + int i; + char *cp; + cram_block *b = NULL; + cram_block *out = (cram_block *)out_; + + /* Find the external block */ + if (slice->block_by_id) { + if (!(b = slice->block_by_id[c->external.content_id])) + return -1; + } else { + for (i = 0; i < 
slice->hdr->num_blocks; i++) { + b = slice->block[i]; + if (b->content_type == EXTERNAL && + b->content_id == c->external.content_id) { + break; + } + } + if (i == slice->hdr->num_blocks || !b) + return -1; + } + + cp = cram_extract_block(b, *out_size); + if (!cp) + return -1; + + BLOCK_APPEND(out, cp, *out_size); + return 0; +} + +void cram_external_decode_free(cram_codec *c) { + if (c) + free(c); +} + +cram_codec *cram_external_decode_init(char *data, int size, + enum cram_external_type option, + int version) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_EXTERNAL; + if (option == E_INT || option == E_LONG) + c->decode = cram_external_decode_int; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_external_decode_char; + else + c->decode = cram_external_decode_block; + c->free = cram_external_decode_free; + + cp += itf8_get(cp, &c->external.content_id); + + if (cp - data != size) { + fprintf(stderr, "Malformed external header stream\n"); + free(c); + return NULL; + } + + c->external.type = option; + + return c; +} + +int cram_external_encode(cram_slice *slice, cram_codec *c, + cram_block *out, char *in, int in_size) { + uint32_t *i32 = (uint32_t *)in; + + itf8_put_blk(out, *i32); + return 0; +} + +void cram_external_encode_free(cram_codec *c) { + if (!c) + return; + free(c); +} + +int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, + int version) { + char tmp[99], *tp = tmp; + int len = 0; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + tp += itf8_put(tp, c->e_external.content_id); + len += itf8_put_blk(b, c->codec); + len += itf8_put_blk(b, tp-tmp); + BLOCK_APPEND(b, tmp, tp-tmp); + len += tp-tmp; + + return len; +} + +cram_codec *cram_external_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version) { + cram_codec *c; + + c = malloc(sizeof(*c)); + if (!c) + return NULL; + 
c->codec = E_EXTERNAL; + c->free = cram_external_encode_free; + c->encode = cram_external_encode; + c->store = cram_external_encode_store; + + c->e_external.content_id = (size_t)dat; + + return c; +} + +/* + * --------------------------------------------------------------------------- + * BETA + */ +int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n; + + if (c->beta.nbits) { + for (i = 0, n = *out_size; i < n; i++) + out_i[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset; + } else { + for (i = 0, n = *out_size; i < n; i++) + out_i[i] = 0; + } + + return 0; +} + +int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int i, n; + + if (c->beta.nbits) { + for (i = 0, n = *out_size; i < n; i++) + out[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset; + } else { + for (i = 0, n = *out_size; i < n; i++) + out[i] = 0; + } + + return 0; +} + +void cram_beta_decode_free(cram_codec *c) { + if (c) + free(c); +} + +cram_codec *cram_beta_decode_init(char *data, int size, + enum cram_external_type option, + int version) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_BETA; + if (option == E_INT || option == E_LONG) + c->decode = cram_beta_decode_int; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_beta_decode_char; + else + abort(); + c->free = cram_beta_decode_free; + + cp += itf8_get(cp, &c->beta.offset); + cp += itf8_get(cp, &c->beta.nbits); + + if (cp - data != size) { + fprintf(stderr, "Malformed beta header stream\n"); + free(c); + return NULL; + } + + return c; +} + +int cram_beta_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + len += itf8_put_blk(b, c->codec); + len += itf8_put_blk(b, 
itf8_size(c->e_beta.offset) + + itf8_size(c->e_beta.nbits)); // codec length + len += itf8_put_blk(b, c->e_beta.offset); + len += itf8_put_blk(b, c->e_beta.nbits); + + return len; +} + +int cram_beta_encode_int(cram_slice *slice, cram_codec *c, + cram_block *out, char *in, int in_size) { + int *syms = (int *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(out, syms[i] + c->e_beta.offset, c->e_beta.nbits); + + return r; +} + +int cram_beta_encode_char(cram_slice *slice, cram_codec *c, + cram_block *out, char *in, int in_size) { + unsigned char *syms = (unsigned char *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(out, syms[i] + c->e_beta.offset, c->e_beta.nbits); + + return r; +} + +void cram_beta_encode_free(cram_codec *c) { + if (c) free(c); +} + +cram_codec *cram_beta_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version) { + cram_codec *c; + int min_val, max_val, len = 0; + + c = malloc(sizeof(*c)); + if (!c) + return NULL; + c->codec = E_BETA; + c->free = cram_beta_encode_free; + if (option == E_INT) + c->encode = cram_beta_encode_int; + else + c->encode = cram_beta_encode_char; + c->store = cram_beta_encode_store; + + if (dat) { + min_val = ((int *)dat)[0]; + max_val = ((int *)dat)[1]; + } else { + min_val = INT_MAX; + max_val = INT_MIN; + int i; + for (i = 0; i < MAX_STAT_VAL; i++) { + if (!st->freqs[i]) + continue; + if (min_val > i) + min_val = i; + max_val = i; + } + if (st->h) { + khint_t k; + + for (k = kh_begin(st->h); k != kh_end(st->h); k++) { + if (!kh_exist(st->h, k)) + continue; + + i = kh_key(st->h, k); + if (min_val > i) + min_val = i; + if (max_val < i) + max_val = i; + } + } + } + + assert(max_val >= min_val); + c->e_beta.offset = -min_val; + max_val -= min_val; + while (max_val) { + len++; + max_val >>= 1; + } + c->e_beta.nbits = len; + + return c; +} + +/* + * --------------------------------------------------------------------------- + * SUBEXP + */ 
+int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int n, count; + int k = c->subexp.k; + + for (count = 0, n = *out_size; count < n; count++) { + int i = 0, tail; + int val; + + /* Get number of 1s */ + //while (get_bit_MSB(in) == 1) i++; + i = get_one_bits_MSB(in); + + /* + * Val is + * i > 0: 2^(k+i-1) + k+i-1 bits + * i = 0: k bits + */ + if (i) { + tail = i + k-1; + val = 0; + while (tail) { + //val = val<<1; val |= get_bit_MSB(in); + GET_BIT_MSB(in, val); + tail--; + } + val += 1 << (i + k-1); + } else { + tail = k; + val = 0; + while (tail) { + //val = val<<1; val |= get_bit_MSB(in); + GET_BIT_MSB(in, val); + tail--; + } + } + + out_i[count] = val - c->subexp.offset; + } + + return 0; +} + +void cram_subexp_decode_free(cram_codec *c) { + if (c) + free(c); +} + +cram_codec *cram_subexp_decode_init(char *data, int size, + enum cram_external_type option, + int version) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_SUBEXP; + c->decode = cram_subexp_decode; + c->free = cram_subexp_decode_free; + + cp += itf8_get(cp, &c->subexp.offset); + cp += itf8_get(cp, &c->subexp.k); + + if (cp - data != size) { + fprintf(stderr, "Malformed subexp header stream\n"); + free(c); + return NULL; + } + + return c; +} + +/* + * --------------------------------------------------------------------------- + * GAMMA + */ +int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n; + + for (i = 0, n = *out_size; i < n; i++) { + int nz = 0; + int val; + //while (get_bit_MSB(in) == 0) nz++; + nz = get_zero_bits_MSB(in); + val = 1; + while (nz > 0) { + //val <<= 1; val |= get_bit_MSB(in); + GET_BIT_MSB(in, val); + nz--; + } + + out_i[i] = val - c->gamma.offset; + } + + return 0; +} + +void cram_gamma_decode_free(cram_codec *c) { + if (c) + free(c); +} + 
+cram_codec *cram_gamma_decode_init(char *data, int size, + enum cram_external_type option, + int version) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_GAMMA; + c->decode = cram_gamma_decode; + c->free = cram_gamma_decode_free; + + cp += itf8_get(cp, &c->gamma.offset); + + if (cp - data != size) { + fprintf(stderr, "Malformed gamma header stream\n"); + free(c); + return NULL; + } + + return c; +} + +/* + * --------------------------------------------------------------------------- + * HUFFMAN + */ + +static int code_sort(const void *vp1, const void *vp2) { + const cram_huffman_code *c1 = (const cram_huffman_code *)vp1; + const cram_huffman_code *c2 = (const cram_huffman_code *)vp2; + + if (c1->len != c2->len) + return c1->len - c2->len; + else + return c1->symbol - c2->symbol; +} + +void cram_huffman_decode_free(cram_codec *c) { + if (!c) + return; + + if (c->huffman.codes) + free(c->huffman.codes); + free(c); +} + +int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int i, n; + + /* Special case of 0 length codes */ + for (i = 0, n = *out_size; i < n; i++) { + out[i] = c->huffman.codes[0].symbol; + } + return 0; +} + +int cram_huffman_decode_char(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int i, n, ncodes = c->huffman.ncodes; + const cram_huffman_code * const codes = c->huffman.codes; + + for (i = 0, n = *out_size; i < n; i++) { + int idx = 0; + int val = 0, len = 0, last_len = 0; + + for (;;) { + int dlen = codes[idx].len - last_len; + if (dlen <= 0 || (in->alloc - in->byte)*8 + in->bit + 7 < dlen) + return -1; + + //val <<= dlen; + //val |= get_bits_MSB(in, dlen); + //last_len = (len += dlen); + + last_len = (len += dlen); + for (; dlen; dlen--) GET_BIT_MSB(in, val); + + idx = val - codes[idx].p; + if (idx >= ncodes || idx < 0) + return -1; + + if (codes[idx].code == val && codes[idx].len == len) { + 
out[i] = codes[idx].symbol; + break; + } + } + } + + return 0; +} + +int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n; + const cram_huffman_code * const codes = c->huffman.codes; + + /* Special case of 0 length codes */ + for (i = 0, n = *out_size; i < n; i++) { + out_i[i] = codes[0].symbol; + } + return 0; +} + +int cram_huffman_decode_int(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n, ncodes = c->huffman.ncodes; + const cram_huffman_code * const codes = c->huffman.codes; + + for (i = 0, n = *out_size; i < n; i++) { + int idx = 0; + int val = 0, len = 0, last_len = 0; + + // Now one bit at a time for remaining checks + for (;;) { + int dlen = codes[idx].len - last_len; + if (dlen <= 0 || (in->alloc - in->byte)*8 + in->bit + 7 < dlen) + return -1; + + //val <<= dlen; + //val |= get_bits_MSB(in, dlen); + //last_len = (len += dlen); + + last_len = (len += dlen); + for (; dlen; dlen--) GET_BIT_MSB(in, val); + + idx = val - codes[idx].p; + if (idx >= ncodes || idx < 0) + return -1; + + if (codes[idx].code == val && codes[idx].len == len) { + out_i[i] = codes[idx].symbol; + break; + } + } + } + + return 0; +} + +/* + * Initialises a huffman decoder from an encoding data stream. 
+ */ +cram_codec *cram_huffman_decode_init(char *data, int size, + enum cram_external_type option, + int version) { + int32_t ncodes, i, j; + char *cp = data, *data_end = &data[size]; + cram_codec *h; + cram_huffman_code *codes; + int32_t val, last_len, max_len = 0; + + cp += itf8_get(cp, &ncodes); + h = calloc(1, sizeof(*h)); + if (!h) + return NULL; + + h->free = cram_huffman_decode_free; + + h->huffman.ncodes = ncodes; + codes = h->huffman.codes = malloc(ncodes * sizeof(*codes)); + if (!codes) { + free(h); + return NULL; + } + + /* Read symbols and bit-lengths */ + for (i = 0; i < ncodes && cp < data_end; i++) { + cp += itf8_get(cp, &codes[i].symbol); + } + + if (cp >= data_end) { + fprintf(stderr, "Malformed huffman header stream\n"); + free(h); + return NULL; + } + cp += itf8_get(cp, &i); + if (i != ncodes) { + fprintf(stderr, "Malformed huffman header stream\n"); + free(h); + return NULL; + } + + if (ncodes == 0) { + /* NULL huffman stream */ + return h; + } + + for (i = 0; i < ncodes && cp < data_end; i++) { + cp += itf8_get(cp, &codes[i].len); + if (max_len < codes[i].len) + max_len = codes[i].len; + } + if (cp - data != size || max_len >= ncodes) { + fprintf(stderr, "Malformed huffman header stream\n"); + free(h); + return NULL; + } + + /* Sort by bit length and then by symbol value */ + qsort(codes, ncodes, sizeof(*codes), code_sort); + + /* Assign canonical codes */ + val = -1, last_len = 0; + for (i = 0; i < ncodes; i++) { + val++; + if (codes[i].len > last_len) { + while (codes[i].len > last_len) { + val <<= 1; + last_len++; + } + } + codes[i].code = val; + } + + /* + * Compute the next starting point, offset by the i'th value. + * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then + * codes[10..13].p = 30 - 10. 
+ */ + last_len = 0; + for (i = j = 0; i < ncodes; i++) { + if (codes[i].len > last_len) { + j = codes[i].code - i; + last_len = codes[i].len; + } + codes[i].p = j; + } + +// puts("==HUFF LEN=="); +// for (i = 0; i <= last_len+1; i++) { +// printf("len %d=%d prefix %d\n", i, h->huffman.lengths[i], h->huffman.prefix[i]); +// } +// puts("===HUFFMAN CODES==="); +// for (i = 0; i < ncodes; i++) { +// int j; +// printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code); +// j = codes[i].len; +// while (j) { +// putchar(codes[i].code & (1 << --j) ? '1' : '0'); +// } +// printf(" %d\n", codes[i].code); +// } + + h->codec = E_HUFFMAN; + if (option == E_BYTE || option == E_BYTE_ARRAY) { + if (h->huffman.codes[0].len == 0) + h->decode = cram_huffman_decode_char0; + else + h->decode = cram_huffman_decode_char; + } else if (option == E_BYTE_ARRAY_BLOCK) { + abort(); + } else { + if (h->huffman.codes[0].len == 0) + h->decode = cram_huffman_decode_int0; + else + h->decode = cram_huffman_decode_int; + } + + return (cram_codec *)h; +} + +int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c, + cram_block *out, char *in, int in_size) { + return 0; +} + +int cram_huffman_encode_char(cram_slice *slice, cram_codec *c, + cram_block *out, char *in, int in_size) { + int i, code, len, r = 0; + unsigned char *syms = (unsigned char *)in; + + do { + int sym = *syms++; + if (sym >= -1 && sym < MAX_HUFF) { + i = c->e_huffman.val2code[sym+1]; + assert(c->e_huffman.codes[i].symbol == sym); + code = c->e_huffman.codes[i].code; + len = c->e_huffman.codes[i].len; + } else { + /* Slow - use a lookup table for when sym < MAX_HUFF? 
*/ + for (i = 0; i < c->e_huffman.nvals; i++) { + if (c->e_huffman.codes[i].symbol == sym) + break; + } + if (i == c->e_huffman.nvals) + return -1; + + code = c->e_huffman.codes[i].code; + len = c->e_huffman.codes[i].len; + } + + r |= store_bits_MSB(out, code, len); + } while (--in_size); + + return r; +} + +int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c, + cram_block *out, char *in, int in_size) { + return 0; +} + +int cram_huffman_encode_int(cram_slice *slice, cram_codec *c, + cram_block *out, char *in, int in_size) { + int i, code, len, r = 0; + int *syms = (int *)in; + + do { + int sym = *syms++; + + if (sym >= -1 && sym < MAX_HUFF) { + i = c->e_huffman.val2code[sym+1]; + assert(c->e_huffman.codes[i].symbol == sym); + code = c->e_huffman.codes[i].code; + len = c->e_huffman.codes[i].len; + } else { + /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */ + for (i = 0; i < c->e_huffman.nvals; i++) { + if (c->e_huffman.codes[i].symbol == sym) + break; + } + if (i == c->e_huffman.nvals) + return -1; + + code = c->e_huffman.codes[i].code; + len = c->e_huffman.codes[i].len; + } + + r |= store_bits_MSB(out, code, len); + } while (--in_size); + + return r; +} + +void cram_huffman_encode_free(cram_codec *c) { + if (!c) + return; + + if (c->e_huffman.codes) + free(c->e_huffman.codes); + free(c); +} + +/* + * Encodes a huffman tree. + * Returns number of bytes written. + */ +int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, + int version) { + int i, len = 0; + cram_huffman_code *codes = c->e_huffman.codes; + /* + * Up to code length 127 means 2.5e+26 bytes of data required (worst + * case huffman tree needs symbols with freqs matching the Fibonacci + * series). So guaranteed 1 byte per code. + * + * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8). 
+ * + * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory + */ + char *tmp = malloc(6*c->e_huffman.nvals+16); + char *tp = tmp; + + if (!tmp) + return -1; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + tp += itf8_put(tp, c->e_huffman.nvals); + for (i = 0; i < c->e_huffman.nvals; i++) { + tp += itf8_put(tp, codes[i].symbol); + } + + tp += itf8_put(tp, c->e_huffman.nvals); + for (i = 0; i < c->e_huffman.nvals; i++) { + tp += itf8_put(tp, codes[i].len); + } + + len += itf8_put_blk(b, c->codec); + len += itf8_put_blk(b, tp-tmp); + BLOCK_APPEND(b, tmp, tp-tmp); + len += tp-tmp; + + free(tmp); + + return len; +} + +cram_codec *cram_huffman_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version) { + int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens, code, len; + int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k; + cram_codec *c; + cram_huffman_code *codes; + + c = malloc(sizeof(*c)); + if (!c) + return NULL; + c->codec = E_HUFFMAN; + + /* Count number of unique symbols */ + for (nvals = i = 0; i < MAX_STAT_VAL; i++) { + if (!st->freqs[i]) + continue; + if (nvals >= vals_alloc) { + vals_alloc = vals_alloc ? vals_alloc*2 : 1024; + vals = realloc(vals, vals_alloc * sizeof(int)); + freqs = realloc(freqs, vals_alloc * sizeof(int)); + if (!vals || !freqs) { + if (vals) free(vals); + if (freqs) free(freqs); + free(c); + return NULL; + } + } + vals[nvals] = i; + freqs[nvals] = st->freqs[i]; + assert(st->freqs[i] > 0); + ntot += freqs[nvals]; + if (max_val < i) max_val = i; + if (min_val > i) min_val = i; + nvals++; + } + if (st->h) { + khint_t k; + + for (k = kh_begin(st->h); k != kh_end(st->h); k++) { + if (!kh_exist(st->h, k)) + continue; + if (nvals >= vals_alloc) { + vals_alloc = vals_alloc ? 
vals_alloc*2 : 1024; + vals = realloc(vals, vals_alloc * sizeof(int)); + freqs = realloc(freqs, vals_alloc * sizeof(int)); + if (!vals || !freqs) + return NULL; + } + vals[nvals]= kh_key(st->h, k); + freqs[nvals] = kh_val(st->h, k); + assert(freqs[nvals] > 0); + ntot += freqs[nvals]; + if (max_val < i) max_val = i; + if (min_val > i) min_val = i; + nvals++; + } + } + + assert(nvals > 0); + + freqs = realloc(freqs, 2*nvals*sizeof(*freqs)); + lens = calloc(2*nvals, sizeof(*lens)); + if (!lens || !freqs) + return NULL; + + /* Inefficient, use pointers to form chain so we can insert and maintain + * a sorted list? This is currently O(nvals^2) complexity. + */ + for (;;) { + int low1 = INT_MAX, low2 = INT_MAX; + int ind1 = 0, ind2 = 0; + for (i = 0; i < nvals; i++) { + if (freqs[i] < 0) + continue; + if (low1 > freqs[i]) + low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i; + else if (low2 > freqs[i]) + low2 = freqs[i], ind2 = i; + } + if (low2 == INT_MAX) + break; + + freqs[nvals] = low1 + low2; + lens[ind1] = nvals; + lens[ind2] = nvals; + freqs[ind1] *= -1; + freqs[ind2] *= -1; + nvals++; + } + nvals = nvals/2+1; + + /* Assign lengths */ + for (i = 0; i < nvals; i++) { + int code_len = 0; + for (k = lens[i]; k; k = lens[k]) + code_len++; + lens[i] = code_len; + freqs[i] *= -1; + //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]); + } + + + /* Sort, need in a struct */ + if (!(codes = malloc(nvals * sizeof(*codes)))) + return NULL; + for (i = 0; i < nvals; i++) { + codes[i].symbol = vals[i]; + codes[i].len = lens[i]; + } + qsort(codes, nvals, sizeof(*codes), code_sort); + + /* + * Generate canonical codes from lengths. + * Sort by length. + * Start with 0. + * Every new code of same length is +1. + * Every new code of new length is +1 then <<1 per extra length. 
+ * + * /\ + * a/\ + * /\/\ + * bcd/\ + * ef + * + * a 1 0 + * b 3 4 (0+1)<<2 + * c 3 5 + * d 3 6 + * e 4 14 (6+1)<<1 + * f 5 15 + */ + code = 0; len = codes[0].len; + for (i = 0; i < nvals; i++) { + while (len != codes[i].len) { + code<<=1; + len++; + } + codes[i].code = code++; + + if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF) + c->e_huffman.val2code[codes[i].symbol+1] = i; + + //fprintf(stderr, "sym %d, code %d, len %d\n", + // codes[i].symbol, codes[i].code, codes[i].len); + } + + free(lens); + free(vals); + free(freqs); + + c->e_huffman.codes = codes; + c->e_huffman.nvals = nvals; + + c->free = cram_huffman_encode_free; + if (option == E_BYTE || option == E_BYTE_ARRAY) { + if (c->e_huffman.codes[0].len == 0) + c->encode = cram_huffman_encode_char0; + else + c->encode = cram_huffman_encode_char; + } else { + if (c->e_huffman.codes[0].len == 0) + c->encode = cram_huffman_encode_int0; + else + c->encode = cram_huffman_encode_int; + } + c->store = cram_huffman_encode_store; + + return c; +} + +/* + * --------------------------------------------------------------------------- + * BYTE_ARRAY_LEN + */ +int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, + int *out_size) { + /* Fetch length */ + int32_t len, one = 1; + + c->byte_array_len.len_codec->decode(slice, c->byte_array_len.len_codec, in, (char *)&len, &one); + //printf("ByteArray Len=%d\n", len); + + if (c->byte_array_len.value_codec) { + c->byte_array_len.value_codec->decode(slice, + c->byte_array_len.value_codec, + in, out, &len); + } else { + return -1; + } + + *out_size = len; + + return 0; +} + +void cram_byte_array_len_decode_free(cram_codec *c) { + if (!c) return; + + if (c->byte_array_len.len_codec) + c->byte_array_len.len_codec->free(c->byte_array_len.len_codec); + + if (c->byte_array_len.value_codec) + c->byte_array_len.value_codec->free(c->byte_array_len.value_codec); + + free(c); +} + +cram_codec *cram_byte_array_len_decode_init(char *data, int 
size, + enum cram_external_type option, + int version) { + cram_codec *c; + char *cp = data; + int32_t encoding; + int32_t sub_size; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_BYTE_ARRAY_LEN; + c->decode = cram_byte_array_len_decode; + c->free = cram_byte_array_len_decode_free; + + cp += itf8_get(cp, &encoding); + cp += itf8_get(cp, &sub_size); + c->byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size, + E_INT, version); + cp += sub_size; + + cp += itf8_get(cp, &encoding); + cp += itf8_get(cp, &sub_size); + c->byte_array_len.value_codec = cram_decoder_init(encoding, cp, sub_size, + option, version); + cp += sub_size; + + if (cp - data != size) { + fprintf(stderr, "Malformed byte_array_len header stream\n"); + free(c); + return NULL; + } + + return c; +} + +int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c, + cram_block *out, char *in, int in_size) { + return -1; // not imp. +} + +void cram_byte_array_len_encode_free(cram_codec *c) { + if (!c) + return; + free(c); +} + +int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + len += itf8_put_blk(b, c->codec); + len += itf8_put_blk(b, c->e_byte_array_len.len_len + + c->e_byte_array_len.val_len); + BLOCK_APPEND(b, c->e_byte_array_len.len_dat, c->e_byte_array_len.len_len); + len += c->e_byte_array_len.len_len; + + BLOCK_APPEND(b, c->e_byte_array_len.val_dat, c->e_byte_array_len.val_len); + len += c->e_byte_array_len.val_len; + + return len; +} + +cram_codec *cram_byte_array_len_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version) { + cram_codec *c; + cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat; + + c = malloc(sizeof(*c)); + if (!c) + return NULL; + c->codec = E_BYTE_ARRAY_LEN; + c->free = cram_byte_array_len_encode_free; + c->encode = 
cram_byte_array_len_encode; + c->store = cram_byte_array_len_encode_store; + + c->e_byte_array_len.len_len = e->len_len; + c->e_byte_array_len.len_dat = e->len_dat; + c->e_byte_array_len.val_len = e->val_len; + c->e_byte_array_len.val_dat = e->val_dat; + + return c; +} + +/* + * --------------------------------------------------------------------------- + * BYTE_ARRAY_STOP + */ +int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, + int *out_size) { + int i; + cram_block *b = NULL; + char *cp, ch; + + if (slice->block_by_id) { + if (!(b = slice->block_by_id[c->byte_array_stop.content_id])) + return -1; + } else { + for (i = 0; i < slice->hdr->num_blocks; i++) { + b = slice->block[i]; + if (b->content_type == EXTERNAL && + b->content_id == c->byte_array_stop.content_id) { + break; + } + } + if (i == slice->hdr->num_blocks || !b) + return -1; + } + + if (b->idx >= b->uncomp_size) + return -1; + + cp = (char *)b->data + b->idx; + while ((ch = *cp) != (char)c->byte_array_stop.stop) { + if (cp - (char *)b->data >= b->uncomp_size) + return -1; + *out++ = ch; + cp++; + } + + *out_size = cp - (char *)(b->data + b->idx); + b->idx = cp - (char *)b->data + 1; + + return 0; +} + +int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, + cram_block *in, char *out_, + int *out_size) { + int space = 256; + cram_block *b = NULL; + cram_block *out = (cram_block *)out_; + char *cp, ch, *out_cp, *cp_end, *out_end; + char stop; + + if (slice->block_by_id) { + if (!(b = slice->block_by_id[c->byte_array_stop.content_id])) + return -1; + } else { + int i; + for (i = 0; i < slice->hdr->num_blocks; i++) { + b = slice->block[i]; + if (b->content_type == EXTERNAL && + b->content_id == c->byte_array_stop.content_id) { + break; + } + } + if (i == slice->hdr->num_blocks || !b) + return -1; + } + + if (b->idx >= b->uncomp_size) + return -1; + cp = (char *)b->data + b->idx; + cp_end = (char *)b->data + b->uncomp_size; + 
BLOCK_GROW(out, space); + out_cp = (char *)BLOCK_END(out); + out_end = out_cp + space; + + stop = c->byte_array_stop.stop; + while ((ch = *cp) != stop) { + if (cp++ == cp_end) + return -1; + *out_cp++ = ch; + + if (out_cp == out_end) { + BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out); + space *= 2; + BLOCK_GROW(out, space); + out_cp = (char *)BLOCK_END(out); + out_end = out_cp + space; + } + } + BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out); + + *out_size = cp - (char *)(b->data + b->idx); + b->idx = cp - (char *)b->data + 1; + + return 0; +} + +void cram_byte_array_stop_decode_free(cram_codec *c) { + if (!c) return; + + free(c); +} + +cram_codec *cram_byte_array_stop_decode_init(char *data, int size, + enum cram_external_type option, + int version) { + cram_codec *c; + unsigned char *cp = (unsigned char *)data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_BYTE_ARRAY_STOP; + c->decode = (option == E_BYTE_ARRAY_BLOCK) + ? cram_byte_array_stop_decode_block + : cram_byte_array_stop_decode_char; + c->free = cram_byte_array_stop_decode_free; + + c->byte_array_stop.stop = *cp++; + if (version == CRAM_1_VERS) { + c->byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16) + + (cp[3]<<24); + cp += 4; + } else { + cp += itf8_get(cp, &c->byte_array_stop.content_id); + } + + if ((char *)cp - data != size) { + fprintf(stderr, "Malformed byte_array_stop header stream\n"); + free(c); + return NULL; + } + + return c; +} + +int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c, + cram_block *out, char *in, int in_size) { + return -1; // not imp. 
+} + +void cram_byte_array_stop_encode_free(cram_codec *c) { + if (!c) + return; + free(c); +} + +int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0; + char buf[20], *cp = buf; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + cp += itf8_put(cp, c->codec); + + if (version == CRAM_1_VERS) { + cp += itf8_put(cp, 5); + *cp++ = c->e_byte_array_stop.stop; + *cp++ = (c->e_byte_array_stop.content_id >> 0) & 0xff; + *cp++ = (c->e_byte_array_stop.content_id >> 8) & 0xff; + *cp++ = (c->e_byte_array_stop.content_id >> 16) & 0xff; + *cp++ = (c->e_byte_array_stop.content_id >> 24) & 0xff; + } else { + cp += itf8_put(cp, 1 + itf8_size(c->e_byte_array_stop.content_id)); + *cp++ = c->e_byte_array_stop.stop; + cp += itf8_put(cp, c->e_byte_array_stop.content_id); + } + + BLOCK_APPEND(b, buf, cp-buf); + len += cp-buf; + + return len; +} + +cram_codec *cram_byte_array_stop_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version) { + cram_codec *c; + + c = malloc(sizeof(*c)); + if (!c) + return NULL; + c->codec = E_BYTE_ARRAY_STOP; + c->free = cram_byte_array_stop_encode_free; + c->encode = cram_byte_array_stop_encode; + c->store = cram_byte_array_stop_encode_store; + + c->e_byte_array_stop.stop = ((int *)dat)[0]; + c->e_byte_array_stop.content_id = ((int *)dat)[1]; + + return c; +} + +/* + * --------------------------------------------------------------------------- + */ + +char *cram_encoding2str(enum cram_encoding t) { + switch (t) { + case E_NULL: return "NULL"; + case E_EXTERNAL: return "EXTERNAL"; + case E_GOLOMB: return "GOLOMB"; + case E_HUFFMAN: return "HUFFMAN"; + case E_BYTE_ARRAY_LEN: return "BYTE_ARRAY_LEN"; + case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP"; + case E_BETA: return "BETA"; + case E_SUBEXP: return "SUBEXP"; + case E_GOLOMB_RICE: return "GOLOMB_RICE"; + case E_GAMMA: return "GAMMA"; + } + return "?"; +} + +static 
cram_codec *(*decode_init[])(char *data, + int size, + enum cram_external_type option, + int version) = { + NULL, + cram_external_decode_init, + NULL, + cram_huffman_decode_init, + cram_byte_array_len_decode_init, + cram_byte_array_stop_decode_init, + cram_beta_decode_init, + cram_subexp_decode_init, + NULL, + cram_gamma_decode_init, +}; + +cram_codec *cram_decoder_init(enum cram_encoding codec, + char *data, int size, + enum cram_external_type option, + int version) { + if (decode_init[codec]) { + return decode_init[codec](data, size, option, version); + } else { + fprintf(stderr, "Unimplemented codec of type %s\n", codec2str(codec)); + return NULL; + } +} + +static cram_codec *(*encode_init[])(cram_stats *stx, + enum cram_external_type option, + void *opt, + int version) = { + NULL, + cram_external_encode_init, + NULL, + cram_huffman_encode_init, + cram_byte_array_len_encode_init, + cram_byte_array_stop_encode_init, + cram_beta_encode_init, + NULL, //cram_subexp_encode_init, + NULL, + NULL, //cram_gamma_encode_init, +}; + +cram_codec *cram_encoder_init(enum cram_encoding codec, + cram_stats *st, + enum cram_external_type option, + void *dat, + int version) { + if (st && !st->nvals) + return NULL; + + if (encode_init[codec]) { + return encode_init[codec](st, option, dat, version); + } else { + fprintf(stderr, "Unimplemented codec of type %s\n", codec2str(codec)); + abort(); + } +} diff --git a/star-sys/STAR/source/htslib/cram/cram_codecs.h b/star-sys/STAR/source/htslib/cram/cram_codecs.h new file mode 100644 index 0000000..7037814 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_codecs.h @@ -0,0 +1,155 @@ +/* +Copyright (c) 2012-2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+ + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _CRAM_ENCODINGS_H_ +#define _CRAM_ENCODINGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +struct cram_codec; + +/* + * Slow but simple huffman decoder to start with. + * Read a bit at a time, keeping track of {length, value} + * eg. 1 1 0 1 => {1,1}, {2,3}, {3,6}, {4,13} + * + * Keep track of this through the huffman code table. + * For fast scanning we have an index of where the first code of length X + * appears. 
+ */ +typedef struct { + int32_t symbol; + int32_t p; // next code start value, minus index to codes[] + int32_t code; + int32_t len; +} cram_huffman_code; + +typedef struct { + int ncodes; + cram_huffman_code *codes; +} cram_huffman_decoder; + +#define MAX_HUFF 128 +typedef struct { + cram_huffman_code *codes; + int nvals; + int val2code[MAX_HUFF+1]; // value to code lookup for small values +} cram_huffman_encoder; + +typedef struct { + int32_t offset; + int32_t nbits; +} cram_beta_decoder; + +typedef struct { + int32_t offset; +} cram_gamma_decoder; + +typedef struct { + int32_t offset; + int32_t k; +} cram_subexp_decoder; + +typedef struct { + int32_t content_id; + enum cram_external_type type; +} cram_external_decoder; + +typedef struct { + struct cram_codec *len_codec; + struct cram_codec *value_codec; +} cram_byte_array_len_decoder; + +typedef struct { + unsigned char stop; + int32_t content_id; +} cram_byte_array_stop_decoder; + +typedef struct { + uint32_t len_len; + unsigned char *len_dat; + uint32_t val_len; + unsigned char *val_dat; +} cram_byte_array_len_encoder; + +/* + * A generic codec structure. 
+ */ +typedef struct cram_codec { + enum cram_encoding codec; + void (*free)(struct cram_codec *codec); + int (*decode)(cram_slice *slice, struct cram_codec *codec, + cram_block *in, char *out, int *out_size); + int (*encode)(cram_slice *slice, struct cram_codec *codec, + cram_block *out, char *in, int in_size); + int (*store)(struct cram_codec *codec, cram_block *b, char *prefix, + int version); + union { + cram_huffman_decoder huffman; + cram_external_decoder external; + cram_beta_decoder beta; + cram_gamma_decoder gamma; + cram_subexp_decoder subexp; + cram_byte_array_len_decoder byte_array_len; + cram_byte_array_stop_decoder byte_array_stop; + + cram_huffman_encoder e_huffman; + cram_external_decoder e_external; + cram_byte_array_stop_decoder e_byte_array_stop; + cram_byte_array_len_encoder e_byte_array_len; + cram_beta_decoder e_beta; + }; +} cram_codec; + +char *cram_encoding2str(enum cram_encoding t); + +cram_codec *cram_decoder_init(enum cram_encoding codec, char *data, int size, + enum cram_external_type option, + int version); +cram_codec *cram_encoder_init(enum cram_encoding codec, cram_stats *st, + enum cram_external_type option, void *dat, + int version); + +//int cram_decode(void *codes, char *in, int in_size, char *out, int *out_size); +//void cram_decoder_free(void *codes); + +//#define GET_BIT_MSB(b,v) (void)(v<<=1, v|=(b->data[b->byte] >> b->bit)&1, (--b->bit == -1) && (b->bit = 7, b->byte++)) + +#define GET_BIT_MSB(b,v) (void)(v<<=1, v|=(b->data[b->byte] >> b->bit)&1, b->byte += (b->bit==0), b->bit+=(b->bit==0)*8-1) + +#ifdef __cplusplus +} +#endif + +#endif /* _CRAM_ENCODINGS_H_ */ diff --git a/star-sys/STAR/source/htslib/cram/cram_decode.c b/star-sys/STAR/source/htslib/cram/cram_decode.c new file mode 100644 index 0000000..83fdaee --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_decode.c @@ -0,0 +1,2138 @@ +/* +Copyright (c) 2012-2013 Genome Research Ltd. 
+Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * - In-memory decoding of CRAM data structures. + * - Iterator for reading CRAM record by record. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "io_lib_config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cram/cram.h" +#include "cram/os.h" +#include "cram/md5.h" + +//Whether CIGAR has just M or uses = and X to indicate match and mismatch +//#define USE_X + +/* ---------------------------------------------------------------------- + * CRAM compression headers + */ + +/* + * Decodes the Tag Dictionary record in the preservation map + * Updates the cram compression header. + * + * Returns number of bytes decoded on success + * -1 on failure + */ +int cram_decode_TD(char *cp, cram_block_compression_hdr *h) { + char *op = cp; + unsigned char *dat; + cram_block *b; + int32_t blk_size; + int nTL, i, sz; + + if (!(b = cram_new_block(0, 0))) + return -1; + h->TD_blk = b; + + /* Decode */ + cp += itf8_get(cp, &blk_size); + if (!blk_size) { + h->nTL = 0; + h->TL = NULL; + cram_free_block(b); + return cp - op; + } + + BLOCK_APPEND(b, cp, blk_size); + cp += blk_size; + sz = cp - op; + + // Force nul termination if missing + if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1]) + BLOCK_APPEND_CHAR(b, '\0'); + + /* Set up TL lookup table */ + dat = BLOCK_DATA(b); + + // Count + for (nTL = i = 0; i < BLOCK_SIZE(b); i++) { + nTL++; + while (dat[i]) + i++; + } + + // Copy + h->nTL = nTL; + if (!(h->TL = calloc(h->nTL, sizeof(unsigned char *)))) + return -1; + for (nTL = i = 0; i < BLOCK_SIZE(b); i++) { + h->TL[nTL++] = &dat[i]; + while (dat[i]) + i++; + } + + return sz; +} + +/* + * Decodes a CRAM block compression header. 
+ * Returns header ptr on success + * NULL on failure + */ +cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, + cram_block *b) { + char *cp, *cp_copy; + cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr)); + int i; + int32_t map_size, map_count; + + if (!hdr) + return NULL; + + if (b->method != RAW) { + if (cram_uncompress_block(b)) + return NULL; + } + + cp = (char *)b->data; + + if (fd->version == CRAM_1_VERS) { + cp += itf8_get(cp, &hdr->ref_seq_id); + cp += itf8_get(cp, &hdr->ref_seq_start); + cp += itf8_get(cp, &hdr->ref_seq_span); + cp += itf8_get(cp, &hdr->num_records); + cp += itf8_get(cp, &hdr->num_landmarks); + if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) { + free(hdr); + return NULL; + } + for (i = 0; i < hdr->num_landmarks; i++) { + cp += itf8_get(cp, &hdr->landmark[i]); + } + } + + hdr->preservation_map = kh_init(map); + + memset(hdr->rec_encoding_map, 0, + CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0])); + memset(hdr->tag_encoding_map, 0, + CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0])); + + if (!hdr->preservation_map) { + cram_free_compression_header(hdr); + return NULL; + } + + /* Initialise defaults for preservation map */ + hdr->mapped_qs_included = 0; + hdr->unmapped_qs_included = 0; + hdr->unmapped_placed = 0; + hdr->qs_included = 0; + hdr->read_names_included = 0; + hdr->AP_delta = 1; + memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20); + + /* Preservation map */ + cp += itf8_get(cp, &map_size); cp_copy = cp; + cp += itf8_get(cp, &map_count); + for (i = 0; i < map_count; i++) { + pmap_t hd; + khint_t k; + int r; + + cp += 2; + switch(CRAM_KEY(cp[-2],cp[-1])) { + case CRAM_KEY('M','I'): + hd.i = *cp++; + k = kh_put(map, hdr->preservation_map, "MI", &r); + if (-1 == r) { + cram_free_compression_header(hdr); + return NULL; + } + + kh_val(hdr->preservation_map, k) = hd; + hdr->mapped_qs_included = hd.i; + break; + + case CRAM_KEY('U','I'): + hd.i = *cp++; + k = kh_put(map, 
hdr->preservation_map, "UI", &r); + if (-1 == r) { + cram_free_compression_header(hdr); + return NULL; + } + + kh_val(hdr->preservation_map, k) = hd; + hdr->unmapped_qs_included = hd.i; + break; + + case CRAM_KEY('P','I'): + hd.i = *cp++; + k = kh_put(map, hdr->preservation_map, "PI", &r); + if (-1 == r) { + cram_free_compression_header(hdr); + return NULL; + } + + kh_val(hdr->preservation_map, k) = hd; + hdr->unmapped_placed = hd.i; + break; + + case CRAM_KEY('R','N'): + hd.i = *cp++; + k = kh_put(map, hdr->preservation_map, "RN", &r); + if (-1 == r) { + cram_free_compression_header(hdr); + return NULL; + } + + kh_val(hdr->preservation_map, k) = hd; + hdr->read_names_included = hd.i; + break; + + case CRAM_KEY('A','P'): + hd.i = *cp++; + k = kh_put(map, hdr->preservation_map, "AP", &r); + if (-1 == r) { + cram_free_compression_header(hdr); + return NULL; + } + + kh_val(hdr->preservation_map, k) = hd; + hdr->AP_delta = hd.i; + break; + + case CRAM_KEY('R','R'): + hd.i = *cp++; + k = kh_put(map, hdr->preservation_map, "RR", &r); + if (-1 == r) { + cram_free_compression_header(hdr); + return NULL; + } + + kh_val(hdr->preservation_map, k) = hd; + fd->no_ref = !hd.i; + break; + + case CRAM_KEY('S','M'): + hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C'; + hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G'; + hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T'; + hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N'; + + hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A'; + hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G'; + hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T'; + hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N'; + + hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A'; + hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C'; + hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T'; + hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N'; + + hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A'; + hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C'; + hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G'; + 
hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N'; + + hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A'; + hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C'; + hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G'; + hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T'; + + hd.p = cp; + cp += 5; + + k = kh_put(map, hdr->preservation_map, "SM", &r); + if (-1 == r) { + cram_free_compression_header(hdr); + return NULL; + } + kh_val(hdr->preservation_map, k) = hd; + break; + + case CRAM_KEY('T','D'): { + int sz = cram_decode_TD(cp, hdr); // tag dictionary + if (sz < 0) { + cram_free_compression_header(hdr); + return NULL; + } + + hd.p = cp; + cp += sz; + + k = kh_put(map, hdr->preservation_map, "TD", &r); + if (-1 == r) { + cram_free_compression_header(hdr); + return NULL; + } + kh_val(hdr->preservation_map, k) = hd; + break; + } + + default: + fprintf(stderr, "Unrecognised preservation map key %c%c\n", + cp[-2], cp[-1]); + // guess byte; + cp++; + break; + } + } + if (cp - cp_copy != map_size) { + cram_free_compression_header(hdr); + return NULL; + } + + /* Record encoding map */ + cp += itf8_get(cp, &map_size); cp_copy = cp; + cp += itf8_get(cp, &map_count); + for (i = 0; i < map_count; i++) { + char *key = cp; + int32_t encoding; + int32_t size; + cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc + + if (!m) { + cram_free_compression_header(hdr); + return NULL; + } + + cp += 2; + cp += itf8_get(cp, &encoding); + cp += itf8_get(cp, &size); + + // Fill out cram_map purely for cram_dump to dump out. + m->key = (key[0]<<8)|key[1]; + m->encoding = encoding; + m->size = size; + m->offset = cp - (char *)b->data; + m->codec = NULL; + + if (m->encoding == E_NULL) + continue; + + //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key); + + /* + * For CRAM1.0 CF and BF are Byte and not Int. + * Practically speaking it makes no difference unless we have a + * 1.0 format file that stores these in EXTERNAL as only then + * does Byte vs Int matter. 
+ * + * Neither this C code nor Java reference implementations did this, + * so we gloss over it and treat them as int. + */ + + if (key[0] == 'B' && key[1] == 'F') { + if (!(hdr->BF_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'C' && key[1] == 'F') { + if (!(hdr->CF_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'R' && key[1] == 'I') { + if (!(hdr->RI_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'R' && key[1] == 'L') { + if (!(hdr->RL_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'A' && key[1] == 'P') { + if (!(hdr->AP_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'R' && key[1] == 'G') { + if (!(hdr->RG_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'M' && key[1] == 'F') { + if (!(hdr->MF_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'N' && key[1] == 'S') { + if (!(hdr->NS_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'N' && key[1] == 'P') { + if (!(hdr->NP_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'T' && key[1] == 'S') { + if (!(hdr->TS_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + 
cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'N' && key[1] == 'F') { + if (!(hdr->NF_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'T' && key[1] == 'C') { + if (!(hdr->TC_codec = cram_decoder_init(encoding, cp, size, E_BYTE, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'T' && key[1] == 'N') { + if (!(hdr->TN_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'F' && key[1] == 'N') { + if (!(hdr->FN_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'F' && key[1] == 'C') { + if (!(hdr->FC_codec = cram_decoder_init(encoding, cp, size, E_BYTE, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'F' && key[1] == 'P') { + if (!(hdr->FP_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'B' && key[1] == 'S') { + if (!(hdr->BS_codec = cram_decoder_init(encoding, cp, size, E_BYTE, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'I' && key[1] == 'N') { + if (!(hdr->IN_codec = cram_decoder_init(encoding, cp, size, + E_BYTE_ARRAY, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'S' && key[1] == 'C') { + if (!(hdr->SC_codec = cram_decoder_init(encoding, cp, size, + E_BYTE_ARRAY, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'D' && key[1] == 'L') { + if (!(hdr->DL_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if 
(key[0] == 'B' && key[1] == 'A') { + if (!(hdr->BA_codec = cram_decoder_init(encoding, cp, size, E_BYTE, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'R' && key[1] == 'S') { + if (!(hdr->RS_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'P' && key[1] == 'D') { + if (!(hdr->PD_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'H' && key[1] == 'C') { + if (!(hdr->HC_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'M' && key[1] == 'Q') { + if (!(hdr->MQ_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'R' && key[1] == 'N') { + if (!(hdr->RN_codec = cram_decoder_init(encoding, cp, size, + E_BYTE_ARRAY_BLOCK, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'Q' && key[1] == 'S') { + if (!(hdr->QS_codec = cram_decoder_init(encoding, cp, size, E_BYTE, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + if (!(hdr->Qs_codec = cram_decoder_init(encoding, cp, size, + E_BYTE_ARRAY, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'T' && key[1] == 'L') { + if (!(hdr->TL_codec = cram_decoder_init(encoding, cp, size, E_INT, + fd->version))) { + cram_free_compression_header(hdr); + return NULL; + } + } else if (key[0] == 'T' && key[1] == 'M') { + } else if (key[0] == 'T' && key[1] == 'V') { + } else + fprintf(stderr, "Unrecognised key: %.2s\n", key); + + cp += size; + + m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])]; + hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m; + } + if (cp - cp_copy 
!= map_size) { + cram_free_compression_header(hdr); + return NULL; + } + + /* Tag encoding map */ + cp += itf8_get(cp, &map_size); cp_copy = cp; + cp += itf8_get(cp, &map_count); + for (i = 0; i < map_count; i++) { + int32_t encoding; + int32_t size; + cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc + char *key = cp+1; + + if (!m) { + cram_free_compression_header(hdr); + return NULL; + } + + m->key = (key[0]<<16)|(key[1]<<8)|key[2]; + + cp += 4; // Strictly ITF8, but this suffices + cp += itf8_get(cp, &encoding); + cp += itf8_get(cp, &size); + + m->encoding = encoding; + m->size = size; + m->offset = cp - (char *)b->data; + if (!(m->codec = cram_decoder_init(encoding, cp, size, + E_BYTE_ARRAY_BLOCK, fd->version))) { + cram_free_compression_header(hdr); + free(m); + return NULL; + } + + cp += size; + + m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])]; + hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m; + } + if (cp - cp_copy != map_size) { + cram_free_compression_header(hdr); + return NULL; + } + + return hdr; +} + +/* ---------------------------------------------------------------------- + * CRAM slices + */ + +/* + * Decodes a CRAM (un)mapped slice header block. 
+ * Returns slice header ptr on success + * NULL on failure + */ +cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { + cram_block_slice_hdr *hdr; + char *cp = (char *)b->data; + int i; + + if (b->content_type != MAPPED_SLICE && + b->content_type != UNMAPPED_SLICE) + return NULL; + + if (!(hdr = calloc(1, sizeof(*hdr)))) + return NULL; + + hdr->content_type = b->content_type; + + if (b->content_type == MAPPED_SLICE) { + cp += itf8_get(cp, &hdr->ref_seq_id); + cp += itf8_get(cp, &hdr->ref_seq_start); + cp += itf8_get(cp, &hdr->ref_seq_span); + } + cp += itf8_get(cp, &hdr->num_records); + if (fd->version != CRAM_1_VERS) + cp += itf8_get(cp, &hdr->record_counter); + cp += itf8_get(cp, &hdr->num_blocks); + + cp += itf8_get(cp, &hdr->num_content_ids); + hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t)); + if (!hdr->block_content_ids) { + free(hdr); + return NULL; + } + + for (i = 0; i < hdr->num_content_ids; i++) { + cp += itf8_get(cp, &hdr->block_content_ids[i]); + } + + if (b->content_type == MAPPED_SLICE) { + cp += itf8_get(cp, &hdr->ref_base_id); + } + + if (fd->version != CRAM_1_VERS) { + memcpy(hdr->md5, cp, 16); + } else { + memset(hdr->md5, 0, 16); + } + + return hdr; +} + + +#if 0 +/* Returns the number of bits set in val; it the highest bit used */ +static int nbits(int v) { + static const int MultiplyDeBruijnBitPosition[32] = { + 1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31, + 9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32 + }; + + v |= v >> 1; // first up to set all bits 1 after the first 1 */ + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + + // DeBruijn magic to find top bit + return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27]; +} +#endif + +#if 0 +static int sort_freqs(const void *vp1, const void *vp2) { + const int i1 = *(const int *)vp1; + const int i2 = *(const int *)vp2; + return i1-i2; +} +#endif + +/* 
---------------------------------------------------------------------- + * Primary CRAM sequence decoder + */ + +/* + * Internal part of cram_decode_slice(). + * Generates the sequence, quality and cigar components. + */ +static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, + cram_block *blk, cram_record *cr, SAM_hdr *bfd, + int cf, char *seq, char *qual) { + int prev_pos = 0, f, r = 0, out_sz = 1; + int seq_pos = 1; + int cig_len = 0, ref_pos = cr->apos; + int32_t fn, i32; + enum cigar_op cig_op = BAM_CMATCH; + uint32_t *cigar = s->cigar; + uint32_t ncigar = s->ncigar; + uint32_t cigar_alloc = s->cigar_alloc; + uint32_t nm = 0, md_dist = 0; + int orig_aux = 0; + int decode_md = fd->decode_md; + char buf[20]; + + if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { + memset(qual, 30, cr->len); + } + + if (decode_md) { + orig_aux = BLOCK_SIZE(s->aux_blk); + BLOCK_APPEND(s->aux_blk, "MDZ", 3); + } + + if (!c->comp_hdr->FN_codec) return -1; + r |= c->comp_hdr->FN_codec->decode(s,c->comp_hdr->FN_codec, blk, + (char *)&fn, &out_sz); + + ref_pos--; // count from 0 + cr->cigar = ncigar; + for (f = 0; f < fn; f++) { + int32_t pos; + char op; + + if (ncigar+2 >= cigar_alloc) { + cigar_alloc = cigar_alloc ? 
cigar_alloc*2 : 1024; + s->cigar = cigar; + if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) + return -1; + } + + if (!c->comp_hdr->FC_codec) return -1; + r |= c->comp_hdr->FC_codec->decode(s, c->comp_hdr->FC_codec, blk, + &op, &out_sz); + if (!c->comp_hdr->FP_codec) return -1; + r |= c->comp_hdr->FP_codec->decode(s, c->comp_hdr->FP_codec, blk, + (char *)&pos, &out_sz); + pos += prev_pos; + + if (pos > seq_pos) { + if (pos > cr->len+1) + return -1; + + if (s->ref && cr->ref_id >= 0) { + if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) { + static int whinged = 0; + if (!whinged) + fprintf(stderr, "Ref pos outside of ref " + "sequence boundary\n"); + whinged = 1; + } else { + memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], + pos - seq_pos); + } + } +#ifdef USE_X + if (cig_len && cig_op != BAM_CBASE_MATCH) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + cig_op = BAM_CBASE_MATCH; +#else + if (cig_len && cig_op != BAM_CMATCH) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + cig_op = BAM_CMATCH; +#endif + cig_len += pos - seq_pos; + ref_pos += pos - seq_pos; + md_dist += pos - seq_pos; + seq_pos = pos; + } + + prev_pos = pos; + + switch(op) { + case 'S': { // soft clip: IN + int32_t out_sz2 = 1; + + if (cig_len) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + if (fd->version == CRAM_1_VERS) { + r |= c->comp_hdr->IN_codec + ? c->comp_hdr->IN_codec->decode(s, c->comp_hdr->IN_codec, + blk, &seq[pos-1], &out_sz2) + : (seq[pos-1] = 'N', out_sz2 = 1, 0); + } else { + r |= c->comp_hdr->SC_codec + ? 
c->comp_hdr->SC_codec->decode(s, c->comp_hdr->SC_codec, + blk, &seq[pos-1], &out_sz2) + : (seq[pos-1] = 'N', out_sz2 = 1, 0); + } + cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP; + cig_op = BAM_CSOFT_CLIP; + seq_pos += out_sz2; + break; + } + + case 'X': { // Substitution; BS + unsigned char base; +#ifdef USE_X + if (cig_len && cig_op != BAM_CBASE_MISMATCH) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + if (!c->comp_hdr->BS_codec) return -1; + r |= c->comp_hdr->BS_codec->decode(s, c->comp_hdr->BS_codec, blk, + (char *)&base, &out_sz); + seq[pos-1] = 'N'; // FIXME look up BS=base value + cig_op = BAM_CBASE_MISMATCH; +#else + int ref_base; + if (cig_len && cig_op != BAM_CMATCH) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + if (!c->comp_hdr->BS_codec) return -1; + r |= c->comp_hdr->BS_codec->decode(s, c->comp_hdr->BS_codec, blk, + (char *)&base, &out_sz); + if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) { + seq[pos-1] = 'N'; + } else { + ref_base = fd->L1[(uc)s->ref[ref_pos - s->ref_start +1]]; + seq[pos-1] = c->comp_hdr->substitution_matrix[ref_base][base]; + if (decode_md) { + BLOCK_APPENDF_2(s->aux_blk, buf, "%d%c", + md_dist, s->ref[ref_pos-s->ref_start +1]); + md_dist = 0; + } + } + cig_op = BAM_CMATCH; +#endif + nm++; + cig_len++; + seq_pos++; + ref_pos++; + break; + } + + case 'D': { // Deletion; DL + if (cig_len && cig_op != BAM_CDEL) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + if (!c->comp_hdr->DL_codec) return -1; + r |= c->comp_hdr->DL_codec->decode(s, c->comp_hdr->DL_codec, blk, + (char *)&i32, &out_sz); + if (decode_md) { + BLOCK_APPENDF_1(s->aux_blk, buf, "%d^", md_dist); + BLOCK_APPEND(s->aux_blk, &s->ref[ref_pos - s->ref_start +1], + i32); + md_dist = 0; + } + cig_op = BAM_CDEL; + cig_len += i32; + ref_pos += i32; + nm += i32; + //printf(" %d: DL = %d (ret %d)\n", f, i32, r); + break; + } + + case 'I': { // Insertion (several bases); IN + int32_t out_sz2 = 1; + + if (cig_len && cig_op 
!= BAM_CINS) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + + if (!c->comp_hdr->IN_codec) return -1; + r |= c->comp_hdr->IN_codec->decode(s, c->comp_hdr->IN_codec, blk, + &seq[pos-1], &out_sz2); + cig_op = BAM_CINS; + cig_len += out_sz2; + seq_pos += out_sz2; + nm += out_sz2; + //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2); + break; + } + + case 'i': { // Insertion (single base); BA + if (cig_len && cig_op != BAM_CINS) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + if (!c->comp_hdr->BA_codec) return -1; + r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk, + (char *)&seq[pos-1], &out_sz); + cig_op = BAM_CINS; + cig_len++; + seq_pos++; + nm++; + //printf(" %d: BA = %c (ret %d)\n", f, seq[pos-1], r); + break; + } + + case 'B': { // Read base; BA, QS +#ifdef USE_X + if (cig_len && cig_op != BAM_CBASE_MISMATCH) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } +#else + if (cig_len && cig_op != BAM_CMATCH) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } +#endif + if (!c->comp_hdr->BA_codec) return -1; + r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk, + (char *)&seq[pos-1], &out_sz); + if (!c->comp_hdr->QS_codec) return -1; + r |= c->comp_hdr->QS_codec->decode(s, c->comp_hdr->QS_codec, blk, + (char *)&qual[pos-1], &out_sz); +#ifdef USE_X + cig_op = BAM_CBASE_MISMATCH; +#else + cig_op = BAM_CMATCH; +#endif + cig_len++; + seq_pos++; + ref_pos++; + //printf(" %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r); + break; + } + + case 'Q': { // Quality score; QS + if (!c->comp_hdr->QS_codec) return -1; + r |= c->comp_hdr->QS_codec->decode(s, c->comp_hdr->QS_codec, blk, + (char *)&qual[pos-1], &out_sz); + //printf(" %d: QS = %d (ret %d)\n", f, qc, r); + break; + } + + case 'H': { // hard clip; HC + if (cig_len && cig_op != BAM_CHARD_CLIP) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + if (!c->comp_hdr->HC_codec) return -1; + r 
|= c->comp_hdr->HC_codec->decode(s, c->comp_hdr->HC_codec, blk, + (char *)&i32, &out_sz); + cig_op = BAM_CHARD_CLIP; + cig_len += i32; + nm += i32; + break; + } + + case 'P': { // padding; PD + if (cig_len && cig_op != BAM_CPAD) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + if (!c->comp_hdr->PD_codec) return -1; + r |= c->comp_hdr->PD_codec->decode(s, c->comp_hdr->PD_codec, blk, + (char *)&i32, &out_sz); + cig_op = BAM_CPAD; + cig_len += i32; + nm += i32; + break; + } + + case 'N': { // Ref skip; RS + if (cig_len && cig_op != BAM_CREF_SKIP) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + if (!c->comp_hdr->RS_codec) return -1; + r |= c->comp_hdr->RS_codec->decode(s, c->comp_hdr->RS_codec, blk, + (char *)&i32, &out_sz); + cig_op = BAM_CREF_SKIP; + cig_len += i32; + ref_pos += i32; + nm += i32; + break; + } + + default: + abort(); + } + } + + /* An implement match op for any unaccounted for bases */ + if (cr->len >= seq_pos) { + if (s->ref) { + if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) { + static int whinged = 0; + if (!whinged) + fprintf(stderr, "Ref pos outside of ref sequence boundary\n"); + whinged = 1; + } else { + memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], + cr->len - seq_pos + 1); + ref_pos += cr->len - seq_pos + 1; + md_dist += cr->len - seq_pos + 1; + } + } + + if (ncigar+1 >= cigar_alloc) { + cigar_alloc = cigar_alloc ? 
cigar_alloc*2 : 1024; + s->cigar = cigar; + if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) + return -1; + } +#ifdef USE_X + if (cig_len && cig_op != BAM_CBASE_MATCH) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + cig_op = BAM_CBASE_MATCH; +#else + if (cig_len && cig_op != BAM_CMATCH) { + cigar[ncigar++] = (cig_len<<4) + cig_op; + cig_len = 0; + } + cig_op = BAM_CMATCH; +#endif + cig_len += cr->len - seq_pos+1; + } + if (decode_md) { + BLOCK_APPENDF_1(s->aux_blk, buf, "%d", md_dist); + } + + if (cig_len) { + if (ncigar >= cigar_alloc) { + cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; + s->cigar = cigar; + if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) + return -1; + } + + cigar[ncigar++] = (cig_len<<4) + cig_op; + } + + cr->ncigar = ncigar - cr->cigar; + cr->aend = ref_pos; + + //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos); + + if (!c->comp_hdr->MQ_codec) return -1; + r |= c->comp_hdr->MQ_codec->decode(s, c->comp_hdr->MQ_codec, blk, + (char *)&cr->mqual, &out_sz); + + if (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) { + int32_t out_sz2 = cr->len; + + if (!c->comp_hdr->Qs_codec) return -1; + r |= c->comp_hdr->Qs_codec->decode(s, c->comp_hdr->Qs_codec, blk, + qual, &out_sz2); + } + + s->cigar = cigar; + s->cigar_alloc = cigar_alloc; + s->ncigar = ncigar; + + if (decode_md) { + char buf[7]; + BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z: + cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux; + buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I'; + buf[3] = (nm>> 0) & 0xff; + buf[4] = (nm>> 8) & 0xff; + buf[5] = (nm>>16) & 0xff; + buf[6] = (nm>>24) & 0xff; + BLOCK_APPEND(s->aux_blk, buf, 7); + cr->aux_size += 7; + } + + return r; +} + +/* + * Quick and simple hash lookup for cram_map arrays + */ +static cram_map *map_find(cram_map **map, unsigned char *key, int id) { + cram_map *m; + + m = map[CRAM_MAP(key[0],key[1])]; + while (m && m->key != id) + m= m->next; + + return m; 
+} + +//#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next + + +static int cram_decode_aux_1_0(cram_container *c, cram_slice *s, + cram_block *blk, cram_record *cr) { + int i, r = 0, out_sz = 1; + unsigned char ntags; + + if (!c->comp_hdr->TC_codec) return -1; + r |= c->comp_hdr->TC_codec->decode(s, c->comp_hdr->TC_codec, blk, + (char *)&ntags, &out_sz); + cr->ntags = ntags; + + //printf("TC=%d\n", cr->ntags); + cr->aux_size = 0; + cr->aux = BLOCK_SIZE(s->aux_blk); + + for (i = 0; i < cr->ntags; i++) { + int32_t id, out_sz = 1; + unsigned char tag_data[3]; + cram_map *m; + + //printf("Tag %d/%d\n", i+1, cr->ntags); + if (!c->comp_hdr->TN_codec) return -1; + r |= c->comp_hdr->TN_codec->decode(s, c->comp_hdr->TN_codec, + blk, (char *)&id, &out_sz); + if (out_sz == 3) { + tag_data[0] = ((char *)&id)[0]; + tag_data[1] = ((char *)&id)[1]; + tag_data[2] = ((char *)&id)[2]; + } else { + tag_data[0] = (id>>16) & 0xff; + tag_data[1] = (id>>8) & 0xff; + tag_data[2] = id & 0xff; + } + + m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); + if (!m) + return -1; + BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); + + if (!m->codec) return -1; + r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); + + cr->aux_size += out_sz + 3; + } + + return r; +} + +static int cram_decode_aux(cram_container *c, cram_slice *s, + cram_block *blk, cram_record *cr) { + int i, r = 0, out_sz = 1; + int32_t TL; + unsigned char *TN; + + if (!c->comp_hdr->TL_codec) return -1; + r |= c->comp_hdr->TL_codec->decode(s, c->comp_hdr->TL_codec, blk, + (char *)&TL, &out_sz); + if (r || TL < 0 || TL >= c->comp_hdr->nTL) + return -1; + + TN = c->comp_hdr->TL[TL]; + cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen + + //printf("TC=%d\n", cr->ntags); + cr->aux_size = 0; + cr->aux = BLOCK_SIZE(s->aux_blk); + + for (i = 0; i < cr->ntags; i++) { + int32_t id, out_sz = 1; + unsigned char tag_data[3]; + cram_map *m; + + //printf("Tag %d/%d\n", i+1, 
cr->ntags); + tag_data[0] = *TN++; + tag_data[1] = *TN++; + tag_data[2] = *TN++; + id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2]; + + m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); + if (!m) + return -1; + BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); + + if (!m->codec) return -1; + r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); + cr->aux_size += out_sz + 3; + } + + return r; +} + +/* Resolve mate pair cross-references between recs within this slice */ +static void cram_decode_slice_xref(cram_slice *s) { + int rec; + + for (rec = 0; rec < s->hdr->num_records; rec++) { + cram_record *cr = &s->crecs[rec]; + + if (cr->mate_line >= 0) { + if (cr->mate_line < s->hdr->num_records) { + /* + * On the first read, loop through computing lengths. + * It's not perfect as we have one slice per reference so we + * cannot detect when TLEN should be zero due to seqs that + * map to multiple references. + * + * We also cannot set tlen correct when it spans a slice for + * other reasons. This may make tlen too small. Should we + * fix this by forcing TLEN to be stored verbatim in such cases? + * + * Or do we just admit defeat and output 0 for tlen? It's the + * safe option... + */ + if (cr->tlen == INT_MIN) { + int id1 = rec, id2 = rec; + int aleft = cr->apos, aright = cr->aend; + int tlen; + int ref = cr->ref_id; + + do { + if (aleft > s->crecs[id2].apos) + aleft = s->crecs[id2].apos; + if (aright < s->crecs[id2].aend) + aright = s->crecs[id2].aend; + if (s->crecs[id2].mate_line == -1) { + s->crecs[id2].mate_line = rec; + break; + } + assert(s->crecs[id2].mate_line > id2); + id2 = s->crecs[id2].mate_line; + + if (s->crecs[id2].ref_id != ref) + ref = -1; + } while (id2 != id1); + + if (ref != -1) { + tlen = aright - aleft + 1; + id1 = id2 = rec; + + /* + * When we have two seqs with identical start and + * end coordinates, set +/- tlen based on 1st/last + * bit flags instead, as a tie breaker. 
/*
 * Formats a 16-byte MD5 digest as 32 lower-case hexadecimal characters.
 *
 * md5  the 16 digest bytes
 * out  destination buffer; must hold at least 33 bytes (32 + NUL)
 *
 * Returns out.
 */
static char *md5_print(unsigned char *md5, char *out) {
    static const char hex_digit[] = "0123456789abcdef";
    const unsigned char *end = md5 + 16;
    char *cp = out;

    while (md5 < end) {
        *cp++ = hex_digit[*md5 >> 4];
        *cp++ = hex_digit[*md5 & 15];
        md5++;
    }
    *cp = 0;

    return out;
}
entire slice from container blocks. Fills out s->crecs[] array. + * Returns 0 on success + * -1 on failure + */ +int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, + SAM_hdr *bfd) { + cram_block *blk = s->block[0]; + int32_t bf, ref_id; + unsigned char cf; + int out_sz, r = 0; + int rec; + char *seq, *qual; + int unknown_rg = -1; + int id, embed_ref; + char **refs = NULL; + + for (id = 0; id < s->hdr->num_blocks; id++) { + if (cram_uncompress_block(s->block[id])) + return -1; + } + + blk->bit = 7; // MSB first + + /* Look for unknown RG, added as last by Java CRAM? */ + if (bfd->nrg > 0 && + !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN")) + unknown_rg = bfd->nrg-1; + + if (blk->content_type != CORE) + return -1; + + if (s->crecs) + free(s->crecs); + if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs)))) + return -1; + + ref_id = s->hdr->ref_seq_id; + embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0; + + if (ref_id >= 0) { + if (embed_ref) { + cram_block *b; + if (s->hdr->ref_base_id < 0) { + fprintf(stderr, "No reference specified and " + "no embedded reference is available.\n"); + return -1; + } + if (!s->block_by_id || + !(b = s->block_by_id[s->hdr->ref_base_id])) + return -1; + s->ref = (char *)BLOCK_DATA(b); + s->ref_start = s->hdr->ref_seq_start; + s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1; + } else if (!fd->no_ref) { + //// Avoid Java cramtools bug by loading entire reference seq + //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0); + //s->ref_start = 1; + + s->ref = + cram_get_ref(fd, s->hdr->ref_seq_id, + s->hdr->ref_seq_start, + s->hdr->ref_seq_start + s->hdr->ref_seq_span -1); + s->ref_start = s->hdr->ref_seq_start; + s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1; + + /* Sanity check */ + if (s->ref_start < 0) { + fprintf(stderr, "Slice starts before base 1.\n"); + s->ref_start = 0; + } + pthread_mutex_lock(&fd->ref_lock); + pthread_mutex_lock(&fd->refs->lock); + if (s->ref_end > 
fd->refs->ref_id[ref_id]->length) { + fprintf(stderr, "Slice ends beyond reference end.\n"); + s->ref_end = fd->refs->ref_id[ref_id]->length; + } + pthread_mutex_unlock(&fd->refs->lock); + pthread_mutex_unlock(&fd->ref_lock); + } + } + + if (s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) { + fprintf(stderr, "Unable to fetch reference #%d %d..%d\n", + s->hdr->ref_seq_id, s->hdr->ref_seq_start, + s->hdr->ref_seq_start + s->hdr->ref_seq_span-1); + return -1; + } + + if (fd->version != CRAM_1_VERS && s->hdr->ref_seq_id >= 0 + && !fd->ignore_md5 + && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) { + MD5_CTX md5; + unsigned char digest[16]; + + if (s->ref && s->hdr->ref_seq_id >= 0) { + int start, len; + + if (s->hdr->ref_seq_start >= s->ref_start) { + start = s->hdr->ref_seq_start - s->ref_start; + } else { + fprintf(stderr, "Slice starts before base 1.\n"); + start = 0; + } + + if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) { + len = s->hdr->ref_seq_span; + } else { + fprintf(stderr, "Slice ends beyond reference end.\n"); + len = s->ref_end - s->ref_start + 1; + } + + MD5_Init(&md5); + if (start + len > s->ref_end - s->ref_start + 1) + len = s->ref_end - s->ref_start + 1 - start; + if (len >= 0) + MD5_Update(&md5, s->ref + start, len); + MD5_Final(digest, &md5); + } else if (!s->ref && s->hdr->ref_base_id >= 0) { + cram_block *b; + if (s->block_by_id && (b = s->block_by_id[s->hdr->ref_base_id])) { + MD5_Init(&md5); + MD5_Update(&md5, b->data, b->uncomp_size); + MD5_Final(digest, &md5); + } + } + + if ((!s->ref && s->hdr->ref_base_id < 0) + || memcmp(digest, s->hdr->md5, 16) != 0) { + char M[33]; + fprintf(stderr, "ERROR: md5sum reference mismatch for ref " + "%d pos %d..%d\n", ref_id, s->ref_start, s->ref_end); + fprintf(stderr, "CRAM: %s\n", md5_print(s->hdr->md5, M)); + fprintf(stderr, "Ref : %s\n", md5_print(digest, M)); + return -1; + } + } + + if (ref_id == -2) { + pthread_mutex_lock(&fd->ref_lock); + 
pthread_mutex_lock(&fd->refs->lock); + refs = calloc(fd->refs->nref, sizeof(char *)); + pthread_mutex_unlock(&fd->refs->lock); + pthread_mutex_unlock(&fd->ref_lock); + if (!refs) + return -1; + } + + for (rec = 0; rec < s->hdr->num_records; rec++) { + cram_record *cr = &s->crecs[rec]; + + //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit); + + cr->s = s; + + out_sz = 1; /* decode 1 item */ + if (!c->comp_hdr->BF_codec) return -1; + r |= c->comp_hdr->BF_codec->decode(s, c->comp_hdr->BF_codec, blk, + (char *)&bf, &out_sz); + if (bf < 0 || + bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap)) + return -1; + bf = fd->bam_flag_swap[bf]; + cr->flags = bf; + + if (fd->version == CRAM_1_VERS) { + /* CF is byte in 1.0, int32 in 2.0 */ + if (!c->comp_hdr->CF_codec) return -1; + r |= c->comp_hdr->CF_codec->decode(s, c->comp_hdr->CF_codec, blk, + (char *)&cf, &out_sz); + cr->cram_flags = cf; + } else { + if (!c->comp_hdr->CF_codec) return -1; + r |= c->comp_hdr->CF_codec->decode(s, c->comp_hdr->CF_codec, blk, + (char *)&cr->cram_flags, + &out_sz); + cf = cr->cram_flags; + } + + if (fd->version != CRAM_1_VERS && ref_id == -2) { + if (!c->comp_hdr->RI_codec) return -1; + r |= c->comp_hdr->RI_codec->decode(s, c->comp_hdr->RI_codec, blk, + (char *)&cr->ref_id, &out_sz); + if (cr->ref_id >= 0) { + if (!fd->no_ref) { + if (!refs[cr->ref_id]) + refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0); + s->ref = refs[cr->ref_id]; + } + s->ref_start = 1; + pthread_mutex_lock(&fd->ref_lock); + pthread_mutex_lock(&fd->refs->lock); + s->ref_end = fd->refs->ref_id[cr->ref_id]->length; + pthread_mutex_unlock(&fd->refs->lock); + pthread_mutex_unlock(&fd->ref_lock); + } + } else { + cr->ref_id = ref_id; // Forced constant in CRAM 1.0 + } + + + if (!c->comp_hdr->RL_codec) return -1; + r |= c->comp_hdr->RL_codec->decode(s, c->comp_hdr->RL_codec, blk, + (char *)&cr->len, &out_sz); + + if (!c->comp_hdr->AP_codec) return -1; + r |= c->comp_hdr->AP_codec->decode(s, 
c->comp_hdr->AP_codec, blk, + (char *)&cr->apos, &out_sz); + if (c->comp_hdr->AP_delta) + cr->apos += s->last_apos; + s->last_apos= cr->apos; + + if (!c->comp_hdr->RG_codec) return -1; + r |= c->comp_hdr->RG_codec->decode(s, c->comp_hdr->RG_codec, blk, + (char *)&cr->rg, &out_sz); + if (cr->rg == unknown_rg) + cr->rg = -1; + + cr->name_len = 0; + + if (c->comp_hdr->read_names_included) { + int32_t out_sz2 = 1; + + // Read directly into name cram_block + cr->name = BLOCK_SIZE(s->name_blk); + if (!c->comp_hdr->RN_codec) return -1; + r |= c->comp_hdr->RN_codec->decode(s, c->comp_hdr->RN_codec, blk, + (char *)s->name_blk, &out_sz2); + cr->name_len = out_sz2; + } + + cr->mate_line = -1; + cr->mate_ref_id = -1; + if (cf & CRAM_FLAG_DETACHED) { + if (fd->version == CRAM_1_VERS) { + /* MF is byte in 1.0, int32 in 2.0 */ + unsigned char mf; + if (!c->comp_hdr->MF_codec) return -1; + r |= c->comp_hdr->MF_codec->decode(s, c->comp_hdr->MF_codec, + blk, (char *)&mf, &out_sz); + cr->mate_flags = mf; + } else { + if (!c->comp_hdr->MF_codec) return -1; + r |= c->comp_hdr->MF_codec->decode(s, c->comp_hdr->MF_codec, + blk, + (char *)&cr->mate_flags, + &out_sz); + } + + if (!c->comp_hdr->read_names_included) { + int32_t out_sz2 = 1; + + // Read directly into name cram_block + cr->name = BLOCK_SIZE(s->name_blk); + if (!c->comp_hdr->RN_codec) return -1; + r |= c->comp_hdr->RN_codec->decode(s, c->comp_hdr->RN_codec, + blk, (char *)s->name_blk, + &out_sz2); + cr->name_len = out_sz2; + } + + if (!c->comp_hdr->NS_codec) return -1; + r |= c->comp_hdr->NS_codec->decode(s, c->comp_hdr->NS_codec, blk, + (char *)&cr->mate_ref_id, &out_sz); + +// Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown. 
+// if (cr->mate_ref_id == -1 && cr->flags & 0x01) { +// /* Paired, but unmapped */ +// cr->flags |= BAM_FMUNMAP; +// } + + if (!c->comp_hdr->NP_codec) return -1; + r |= c->comp_hdr->NP_codec->decode(s, c->comp_hdr->NP_codec, blk, + (char *)&cr->mate_pos, &out_sz); + if (!c->comp_hdr->TS_codec) return -1; + r |= c->comp_hdr->TS_codec->decode(s, c->comp_hdr->TS_codec, blk, + (char *)&cr->tlen, &out_sz); + } else if (cf & CRAM_FLAG_MATE_DOWNSTREAM) { + if (!c->comp_hdr->NF_codec) return -1; + r |= c->comp_hdr->NF_codec->decode(s, c->comp_hdr->NF_codec, blk, + (char *)&cr->mate_line, &out_sz); + cr->mate_line += rec + 1; + + //cr->name_len = sprintf(name, "%d", name_id++); + //cr->name = DSTRING_LEN(name_ds); + //dstring_nappend(name_ds, name, cr->name_len); + + cr->mate_ref_id = -1; + cr->tlen = INT_MIN; + cr->mate_pos = 0; + } else { + cr->mate_flags = 0; + cr->tlen = INT_MIN; + } + /* + else if (!name[0]) { + //name[0] = '?'; name[1] = 0; + //cr->name_len = 1; + //cr->name= DSTRING_LEN(s->name_ds); + //dstring_nappend(s->name_ds, "?", 1); + + cr->mate_ref_id = -1; + cr->tlen = 0; + cr->mate_pos = 0; + } + */ + + /* Auxiliary tags */ + if (fd->version == CRAM_1_VERS) + r |= cram_decode_aux_1_0(c, s, blk, cr); + else + r |= cram_decode_aux(c, s, blk, cr); + + /* Fake up dynamic string growth and appending */ + cr->seq = BLOCK_SIZE(s->seqs_blk); + BLOCK_GROW(s->seqs_blk, cr->len); + seq = (char *)BLOCK_END(s->seqs_blk); + BLOCK_SIZE(s->seqs_blk) += cr->len; + + if (!seq) + return -1; + + cr->qual = BLOCK_SIZE(s->qual_blk); + BLOCK_GROW(s->qual_blk, cr->len); + qual = (char *)BLOCK_END(s->qual_blk); + BLOCK_SIZE(s->qual_blk) += cr->len; + + if (!s->ref) + memset(seq, '=', cr->len); + + if (!(bf & BAM_FUNMAP)) { + /* Decode sequence and generate CIGAR */ + r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual); + } else { + int out_sz2 = cr->len; + + //puts("Unmapped"); + cr->cigar = 0; + cr->ncigar = 0; + cr->aend = cr->apos; + cr->mqual = 0; + + if 
(!c->comp_hdr->BA_codec) return -1; + r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk, + (char *)seq, &out_sz2); + + if (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) { + out_sz2 = cr->len; + if (!c->comp_hdr->Qs_codec) return -1; + r |= c->comp_hdr->Qs_codec->decode(s, c->comp_hdr->Qs_codec, + blk, qual, &out_sz2); + } else { + memset(qual, 30, cr->len); + } + } + } + + pthread_mutex_lock(&fd->ref_lock); + if (refs) { + int i; + for (i = 0; i < fd->refs->nref; i++) { + if (refs[i]) + cram_ref_decr(fd->refs, i); + } + free(refs); + } else if (ref_id >= 0 && s->ref != fd->ref_free) { + cram_ref_decr(fd->refs, ref_id); + } + pthread_mutex_unlock(&fd->ref_lock); + + /* Resolve mate pair cross-references between recs within this slice */ + cram_decode_slice_xref(s); + + return r; +} + +typedef struct { + cram_fd *fd; + cram_container *c; + cram_slice *s; + SAM_hdr *h; + int exit_code; +} cram_decode_job; + +void *cram_decode_slice_thread(void *arg) { + cram_decode_job *j = (cram_decode_job *)arg; + + j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h); + + return j; +} + +/* + * Spawn a multi-threaded version of cram_decode_slice(). + */ +int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s, + SAM_hdr *bfd) { + cram_decode_job *j; + int nonblock; + + if (!fd->pool) + return cram_decode_slice(fd, c, s, bfd); + + if (!(j = malloc(sizeof(*j)))) + return -1; + + j->fd = fd; + j->c = c; + j->s = s; + j->h = bfd; + + nonblock = t_pool_results_queue_len(fd->rqueue) ? 0 : 1; + + if (-1 == t_pool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread, + j, nonblock)) { + /* Would block */ + fd->job_pending = j; + } else { + fd->job_pending = NULL; + } + + // flush too + return 0; +} + + +/* ---------------------------------------------------------------------- + * CRAM sequence iterators. + */ + +/* + * Converts a cram in-memory record into a bam in-memory record. 
We + * pass a pointer to a bam_seq_t pointer along with the a pointer to + * the allocated size. These can initially be pointers to NULL and zero. + * + * This function will reallocate the bam buffer as required and update + * (*bam)->alloc accordingly, allowing it to be used within a loop + * efficiently without needing to allocate new bam objects over and + * over again. + * + * Returns the used size of the bam record on success + * -1 on failure. + */ +static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s, + cram_record *cr, int rec, bam_seq_t **bam) { + int bam_idx, rg_len; + char name_a[1024], *name; + int name_len; + char *aux, *aux_orig; + + /* Assign names if not explicitly set */ + if (cr->name_len) { + name = (char *)BLOCK_DATA(s->name_blk) + cr->name; + name_len = cr->name_len; + } else { + // FIXME: add prefix, container number, slice number, etc + name = name_a; + + if (cr->mate_line >= 0 && cr->mate_line < rec) + name_len = sprintf(name_a, "%s:%"PRId64":%d", + fd->prefix, s->id, cr->mate_line); + else + name_len = sprintf(name_a, "%s:%"PRId64":%d", + fd->prefix, s->id, rec); + } + + /* Generate BAM record */ + if (cr->rg < -1 || cr->rg >= bfd->nrg) + return -1; + rg_len = (cr->rg != -1) ? 
bfd->rg[cr->rg].name_len + 4 : 0; + + if (!BLOCK_DATA(s->seqs_blk)) + return -1; + if (!BLOCK_DATA(s->qual_blk)) + return -1; + + bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len, + name, name_len, + cr->flags, + cr->ref_id, + cr->apos, + cr->aend, + cr->mqual, + cr->ncigar, &s->cigar[cr->cigar], + cr->mate_ref_id, + cr->mate_pos, + cr->tlen, + cr->len, + (char *)BLOCK_DATA(s->seqs_blk) + cr->seq, + (char *)BLOCK_DATA(s->qual_blk) + cr->qual); + if (bam_idx == -1) + return -1; + + aux = aux_orig = (char *)bam_aux(*bam); + + /* Auxiliary strings */ + if (cr->aux_size != 0) { + memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size); + aux += cr->aux_size; + } + + /* RG:Z: */ + if (cr->rg != -1) { + int len = bfd->rg[cr->rg].name_len; + *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z'; + memcpy(aux, bfd->rg[cr->rg].name, len); + aux += len; + *aux++ = 0; + } + +#ifndef SAMTOOLS + bam_set_blk_size(*bam, bam_blk_size(*bam) + (aux - aux_orig)); +#endif + + *aux++ = 0; + + return bam_idx + (aux - aux_orig); +} + +/* + * Here be dragons! The multi-threading code in this is crufty beyond belief. + */ +static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { + cram_container *c; + cram_slice *s = NULL; + + fd->eof = 0; + + if (!(c = fd->ctr)) { + // Load first container. + do { + if (!(c = fd->ctr = cram_read_container(fd))) + return NULL; + } while (c->length == 0); + + /* + * The first container may be a result of a sub-range query. + * In which case it may still not be the optimal starting point + * due to skipped containers/slices in the index. 
+ */ + if (fd->range.refid != -2) { + while (c->ref_seq_id != -2 && + (c->ref_seq_id < fd->range.refid || + c->ref_seq_start + c->ref_seq_span-1 < fd->range.start)) { + if (0 != cram_seek(fd, c->length, SEEK_CUR)) + return NULL; + cram_free_container(fd->ctr); + do { + if (!(c = fd->ctr = cram_read_container(fd))) + return NULL; + } while (c->length == 0); + } + + if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) + return NULL; + } + + if (!(c->comp_hdr_block = cram_read_block(fd))) + return NULL; + if (c->comp_hdr_block->content_type != COMPRESSION_HEADER) + return NULL; + + c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block); + if (!c->comp_hdr) + return NULL; + if (!c->comp_hdr->AP_delta) { + pthread_mutex_lock(&fd->ref_lock); + fd->unsorted = 1; + pthread_mutex_unlock(&fd->ref_lock); + } + } + + if ((s = c->slice)) + cram_free_slice(s); + + if (c->curr_slice == c->max_slice) { + cram_free_container(c); + c = NULL; + } + + /* Sorry this is so contorted! */ + for (;;) { + if (fd->job_pending) { + cram_decode_job *j = (cram_decode_job *)fd->job_pending; + c = j->c; + s = j->s; + free(fd->job_pending); + fd->job_pending = NULL; + } else if (!fd->ooc) { + empty_container: + if (!c || c->curr_slice == c->max_slice) { + // new container + do { + if (!(c = fd->ctr = cram_read_container(fd))) { + if (fd->pool) { + fd->ooc = 1; + break; + } + + return NULL; + } + } while (c->length == 0); + if (fd->ooc) + break; + + /* Skip containers not yet spanning our range */ + if (fd->range.refid != -2 && c->ref_seq_id != -2) { + if (c->ref_seq_id != fd->range.refid) { + fd->eof = 1; + return NULL; + } + + if (c->ref_seq_start > fd->range.end) { + fd->eof = 1; + return NULL; + } + + if (c->ref_seq_start + c->ref_seq_span-1 < + fd->range.start) { + c->curr_rec = c->max_rec; + c->curr_slice = c->max_slice; + cram_seek(fd, c->length, SEEK_CUR); + cram_free_container(c); + c = NULL; + continue; + } + } + + if (!(c->comp_hdr_block = cram_read_block(fd))) + 
return NULL; + if (c->comp_hdr_block->content_type != COMPRESSION_HEADER) + return NULL; + + c->comp_hdr = + cram_decode_compression_header(fd, c->comp_hdr_block); + if (!c->comp_hdr) + return NULL; + + if (!c->comp_hdr->AP_delta) { + pthread_mutex_lock(&fd->ref_lock); + fd->unsorted = 1; + pthread_mutex_unlock(&fd->ref_lock); + } + } + + if (c->num_records == 0) { + cram_free_container(c); c = NULL; + goto empty_container; + } + + + if (!(s = c->slice = cram_read_slice(fd))) + return NULL; + c->curr_slice++; + c->curr_rec = 0; + c->max_rec = s->hdr->num_records; + + s->last_apos = s->hdr->ref_seq_start; + + /* Skip slices not yet spanning our range */ + if (fd->range.refid != -2 && s->hdr->ref_seq_id != -2) { + if (s->hdr->ref_seq_id != fd->range.refid) { + fd->eof = 1; + cram_free_slice(s); + c->slice = NULL; + return NULL; + } + + if (s->hdr->ref_seq_start > fd->range.end) { + fd->eof = 1; + cram_free_slice(s); + c->slice = NULL; + return NULL; + } + + if (s->hdr->ref_seq_start + s->hdr->ref_seq_span-1 < + fd->range.start) { + cram_free_slice(s); + c->slice = NULL; + cram_free_container(c); + c = NULL; + continue; + } + } + } + + /* Test decoding of 1st seq */ + if (!c || !s) + break; + + if (cram_decode_slice_mt(fd, c, s, fd->header) != 0) { + // if (cram_decode_slice(fd, c, s, fd->header) != 0) { + fprintf(stderr, "Failure to decode slice\n"); + cram_free_slice(s); + c->slice = NULL; + return NULL; + } + + if (!fd->pool || fd->job_pending) + break; + + if (t_pool_results_queue_sz(fd->rqueue) > fd->pool->qsize) + break; + } + + if (fd->pool) { + t_pool_result *res; + cram_decode_job *j; + +// fprintf(stderr, "Thread pool len = %d, %d\n", +// t_pool_results_queue_len(fd->rqueue), +// t_pool_results_queue_sz(fd->rqueue)); + + if (fd->ooc && t_pool_results_queue_empty(fd->rqueue)) + return NULL; + + res = t_pool_next_result_wait(fd->rqueue); + + if (!res || !res->data) { + fprintf(stderr, "t_pool_next_result failure\n"); + return NULL; + } + + j = (cram_decode_job 
*)res->data; + c = j->c; + s = j->s; + + t_pool_delete_result(res, 1); + } + + *cp = c; + return s; +} + +/* + * Read the next cram record and return it. + * Note that to decode cram_record the caller will need to look up some data + * in the current slice, pointed to by fd->ctr->slice. This is valid until + * the next call to cram_get_seq (which may invalidate it). + * + * Returns record pointer on success (do not free) + * NULL on failure + */ +cram_record *cram_get_seq(cram_fd *fd) { + cram_container *c; + cram_slice *s; + + for (;;) { + c = fd->ctr; + if (c && c->slice && c->curr_rec < c->max_rec) { + s = c->slice; + } else { + if (!(s = cram_next_slice(fd, &c))) + return NULL; + } + + if (fd->range.refid != -2) { + if (s->crecs[c->curr_rec].ref_id < fd->range.refid) { + c->curr_rec++; + continue; + } + + if (s->crecs[c->curr_rec].ref_id != fd->range.refid) { + fd->eof = 1; + cram_free_slice(s); + c->slice = NULL; + return NULL; + } + + if (s->crecs[c->curr_rec].apos > fd->range.end) { + fd->eof = 1; + cram_free_slice(s); + c->slice = NULL; + return NULL; + } + + if (s->crecs[c->curr_rec].aend < fd->range.start) { + c->curr_rec++; + continue; + } + } + + break; + } + + fd->ctr = c; + c->slice = s; + return &s->crecs[c->curr_rec++]; +} + +/* + * Read the next cram record and convert it to a bam_seq_t struct. + * + * Returns 0 on success + * -1 on EOF or failure (check fd->err) + */ +int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) { + cram_record *cr; + cram_container *c; + cram_slice *s; + + if (!(cr = cram_get_seq(fd))) + return -1; + + c = fd->ctr; + s = c->slice; + + return cram_to_bam(fd->header, fd, s, cr, c->curr_rec-1, bam); +} diff --git a/star-sys/STAR/source/htslib/cram/cram_decode.h b/star-sys/STAR/source/htslib/cram/cram_decode.h new file mode 100644 index 0000000..64b188e --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_decode.h @@ -0,0 +1,112 @@ +/* +Copyright (c) 2012-2013 Genome Research Ltd. 
+Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/*! \file + * Include cram.h instead. + * + * This is an internal part of the CRAM system and is automatically included + * when you #include cram.h. + * + * Implements the decoding portion of CRAM I/O. Also see + * cram_codecs.[ch] for the actual encoding functions themselves. + */ + +#ifndef _CRAM_READ_H_ +#define _CRAM_READ_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* ---------------------------------------------------------------------- + * CRAM sequence iterators. 
+ */ + +/*! Read the next cram record and return it as a cram_record. + * + * Note that to decode cram_record the caller will need to look up some data + * in the current slice, pointed to by fd->ctr->slice. This is valid until + * the next call to cram_get_seq (which may invalidate it). + * + * @return + * Returns record pointer on success (do not free); + * NULL on failure + */ +cram_record *cram_get_seq(cram_fd *fd); + +/*! Read the next cram record and convert it to a bam_seq_t struct. + * + * @return + * Returns 0 on success; + * -1 on EOF or failure (check fd->err) + */ +int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam); + + +/* ---------------------------------------------------------------------- + * Internal functions + */ + +/*! INTERNAL: + * Decodes a CRAM block compression header. + * + * @return + * Returns header ptr on success; + * NULL on failure + */ +cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, + cram_block *b); + +/*! INTERNAL: + * Decodes a CRAM (un)mapped slice header block. + * + * @return + * Returns slice header ptr on success; + * NULL on failure + */ +cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b); + + +/*! INTERNAL: + * Decode an entire slice from container blocks. Fills out s->crecs[] array. + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, + SAM_hdr *hdr); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/star-sys/STAR/source/htslib/cram/cram_encode.c b/star-sys/STAR/source/htslib/cram/cram_encode.c new file mode 100644 index 0000000..94c2ceb --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_encode.c @@ -0,0 +1,2630 @@ +/* +Copyright (c) 2012-2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. 
Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "io_lib_config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cram/cram.h" +#include "cram/os.h" +#include "cram/md5.h" + +#ifdef SAMTOOLS +# define bam_copy(dst, src) bam_copy1(*(dst), (src)) +#else +void bam_copy(bam_seq_t **bt, bam_seq_t *bf) { + size_t a; + + if (bf->alloc > (*bt)->alloc) { + a = ((int)((bf->alloc+15)/16))*16; + *bt = realloc(*bt, a); + memcpy(*bt, bf, bf->alloc); + } else { + a = (*bt)->alloc; + memcpy(*bt, bf, bf->alloc); + } + + (*bt)->alloc = a; +} +#endif + +#define Z_CRAM_STRAT Z_FILTERED +//#define Z_CRAM_STRAT Z_RLE +//#define Z_CRAM_STRAT Z_HUFFMAN_ONLY +//#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY + +static int process_one_read(cram_fd *fd, cram_container *c, + cram_slice *s, cram_record *cr, + bam_seq_t *b, int rnum); + +/* + * Returns index of val into key. + * Basically strchr(key, val)-key; + */ +static int sub_idx(char *key, char val) { + int i; + + for (i = 0; *key && *key++ != val; i++); + return i; +} + +/* + * Encodes a compression header block into a generic cram_block structure. + * + * Returns cram_block ptr on success + * NULL on failure + */ +cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, + cram_block_compression_hdr *h) { + cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0); + cram_block *map = cram_new_block(COMPRESSION_HEADER, 0); + int i, mc; + + if (!cb || !map) + return NULL; + + /* + * This is a concatenation of several blocks of data: + * header + landmarks, preservation map, read encoding map, and the tag + * encoding map. + * All 4 are variable sized and we need to know how large these are + * before creating the compression header itself as this starts with + * the total size (stored as a variable length string). 
+ */ + + // Duplicated from container itself, and removed in 1.1 + if (fd->version == CRAM_1_VERS) { + itf8_put_blk(cb, h->ref_seq_id); + itf8_put_blk(cb, h->ref_seq_start); + itf8_put_blk(cb, h->ref_seq_span); + itf8_put_blk(cb, h->num_records); + itf8_put_blk(cb, h->num_landmarks); + for (i = 0; i < h->num_landmarks; i++) { + itf8_put_blk(cb, h->landmark[i]); + } + } + + /* Create in-memory preservation map */ + /* FIXME: should create this when we create the container */ + { + khint_t k; + int r; + + if (!(h->preservation_map = kh_init(map))) + return NULL; + + k = kh_put(map, h->preservation_map, "RN", &r); + if (-1 == r) return NULL; + kh_val(h->preservation_map, k).i = 1; + + if (fd->version == CRAM_1_VERS) { + k = kh_put(map, h->preservation_map, "PI", &r); + if (-1 == r) return NULL; + kh_val(h->preservation_map, k).i = 0; + + k = kh_put(map, h->preservation_map, "UI", &r); + if (-1 == r) return NULL; + kh_val(h->preservation_map, k).i = 1; + + k = kh_put(map, h->preservation_map, "MI", &r); + if (-1 == r) return NULL; + kh_val(h->preservation_map, k).i = 1; + + } else { + // Technically SM was in 1.0, but wasn't in Java impl. 
+ k = kh_put(map, h->preservation_map, "SM", &r); + if (-1 == r) return NULL; + kh_val(h->preservation_map, k).i = 0; + + k = kh_put(map, h->preservation_map, "TD", &r); + if (-1 == r) return NULL; + kh_val(h->preservation_map, k).i = 0; + + k = kh_put(map, h->preservation_map, "AP", &r); + if (-1 == r) return NULL; + kh_val(h->preservation_map, k).i = c->pos_sorted; + + if (fd->no_ref || fd->embed_ref) { + // Reference Required == No + k = kh_put(map, h->preservation_map, "RR", &r); + if (-1 == r) return NULL; + kh_val(h->preservation_map, k).i = 0; + } + } + } + + /* Encode preservation map; could collapse this and above into one */ + mc = 0; + BLOCK_SIZE(map) = 0; + if (h->preservation_map) { + khint_t k; + + for (k = kh_begin(h->preservation_map); + k != kh_end(h->preservation_map); + k++) { + const char *key; + khash_t(map) *pmap = h->preservation_map; + + + if (!kh_exist(pmap, k)) + continue; + + key = kh_key(pmap, k); + BLOCK_APPEND(map, key, 2); + + switch(CRAM_KEY(key[0], key[1])) { + case CRAM_KEY('M','I'): + BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); + break; + + case CRAM_KEY('U','I'): + BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); + break; + + case CRAM_KEY('P','I'): + BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); + break; + + case CRAM_KEY('A','P'): + BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); + break; + + case CRAM_KEY('R','N'): + BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); + break; + + case CRAM_KEY('R','R'): + BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); + break; + + case CRAM_KEY('S','M'): { + char smat[5], *mp = smat; + *mp++ = + (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) | + (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) | + (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) | + (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0); + *mp++ = + (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) | + (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) | + (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) | + (sub_idx("AGTN", 
h->substitution_matrix[1][3]) << 0); + *mp++ = + (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) | + (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) | + (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) | + (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0); + *mp++ = + (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) | + (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) | + (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) | + (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0); + *mp++ = + (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) | + (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) | + (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) | + (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0); + BLOCK_APPEND(map, smat, 5); + break; + } + + case CRAM_KEY('T','D'): { + itf8_put_blk(map, BLOCK_SIZE(h->TD_blk)); + BLOCK_APPEND(map, + BLOCK_DATA(h->TD_blk), + BLOCK_SIZE(h->TD_blk)); + break; + } + + default: + fprintf(stderr, "Unknown preservation key '%.2s'\n", key); + break; + } + + mc++; + } + } + itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); + itf8_put_blk(cb, mc); + BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); + + /* rec encoding map */ + mc = 0; + BLOCK_SIZE(map) = 0; + if (h->BF_codec) { + if (-1 == h->BF_codec->store(h->BF_codec, map, "BF", fd->version)) + return NULL; + mc++; + } + if (h->CF_codec) { + if (-1 == h->CF_codec->store(h->CF_codec, map, "CF", fd->version)) + return NULL; + mc++; + } + if (h->RL_codec) { + if (-1 == h->RL_codec->store(h->RL_codec, map, "RL", fd->version)) + return NULL; + mc++; + } + if (h->AP_codec) { + if (-1 == h->AP_codec->store(h->AP_codec, map, "AP", fd->version)) + return NULL; + mc++; + } + if (h->RG_codec) { + if (-1 == h->RG_codec->store(h->RG_codec, map, "RG", fd->version)) + return NULL; + mc++; + } + if (h->MF_codec) { + if (-1 == h->MF_codec->store(h->MF_codec, map, "MF", fd->version)) + return NULL; + mc++; + } + if (h->NS_codec) { + if (-1 == 
h->NS_codec->store(h->NS_codec, map, "NS", fd->version)) + return NULL; + mc++; + } + if (h->NP_codec) { + if (-1 == h->NP_codec->store(h->NP_codec, map, "NP", fd->version)) + return NULL; + mc++; + } + if (h->TS_codec) { + if (-1 == h->TS_codec->store(h->TS_codec, map, "TS", fd->version)) + return NULL; + mc++; + } + if (h->NF_codec) { + if (-1 == h->NF_codec->store(h->NF_codec, map, "NF", fd->version)) + return NULL; + mc++; + } + if (h->TC_codec) { + if (-1 == h->TC_codec->store(h->TC_codec, map, "TC", fd->version)) + return NULL; + mc++; + } + if (h->TN_codec) { + if (-1 == h->TN_codec->store(h->TN_codec, map, "TN", fd->version)) + return NULL; + mc++; + } + if (h->TL_codec) { + if (-1 == h->TL_codec->store(h->TL_codec, map, "TL", fd->version)) + return NULL; + mc++; + } + if (h->FN_codec) { + if (-1 == h->FN_codec->store(h->FN_codec, map, "FN", fd->version)) + return NULL; + mc++; + } + if (h->FC_codec) { + if (-1 == h->FC_codec->store(h->FC_codec, map, "FC", fd->version)) + return NULL; + mc++; + } + if (h->FP_codec) { + if (-1 == h->FP_codec->store(h->FP_codec, map, "FP", fd->version)) + return NULL; + mc++; + } + if (h->BS_codec) { + if (-1 == h->BS_codec->store(h->BS_codec, map, "BS", fd->version)) + return NULL; + mc++; + } + if (h->IN_codec) { + if (-1 == h->IN_codec->store(h->IN_codec, map, "IN", fd->version)) + return NULL; + mc++; + } + if (h->DL_codec) { + if (-1 == h->DL_codec->store(h->DL_codec, map, "DL", fd->version)) + return NULL; + mc++; + } + if (h->BA_codec) { + if (-1 == h->BA_codec->store(h->BA_codec, map, "BA", fd->version)) + return NULL; + mc++; + } + if (h->MQ_codec) { + if (-1 == h->MQ_codec->store(h->MQ_codec, map, "MQ", fd->version)) + return NULL; + mc++; + } + if (h->RN_codec) { + if (-1 == h->RN_codec->store(h->RN_codec, map, "RN", fd->version)) + return NULL; + mc++; + } + if (h->QS_codec) { + if (-1 == h->QS_codec->store(h->QS_codec, map, "QS", fd->version)) + return NULL; + mc++; + } + if (h->Qs_codec) { + if (-1 == 
h->Qs_codec->store(h->Qs_codec, map, "Qs", fd->version)) + return NULL; + mc++; + } + if (h->RI_codec) { + if (-1 == h->RI_codec->store(h->RI_codec, map, "RI", fd->version)) + return NULL; + mc++; + } + if (fd->version != CRAM_1_VERS) { + if (h->SC_codec) { + if (-1 == h->SC_codec->store(h->SC_codec, map, "SC", fd->version)) + return NULL; + mc++; + } + if (h->RS_codec) { + if (-1 == h->RS_codec->store(h->RS_codec, map, "RS", fd->version)) + return NULL; + mc++; + } + if (h->PD_codec) { + if (-1 == h->PD_codec->store(h->PD_codec, map, "PD", fd->version)) + return NULL; + mc++; + } + if (h->HC_codec) { + if (-1 == h->HC_codec->store(h->HC_codec, map, "HC", fd->version)) + return NULL; + mc++; + } + } + if (h->TM_codec) { + if (-1 == h->TM_codec->store(h->TM_codec, map, "TM", fd->version)) + return NULL; + mc++; + } + if (h->TV_codec) { + if (-1 == h->TV_codec->store(h->TV_codec, map, "TV", fd->version)) + return NULL; + mc++; + } + itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); + itf8_put_blk(cb, mc); + BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); + + /* tag encoding map */ +#if 0 + mp = map; mc = 0; + if (h->tag_encoding_map) { + HashItem *hi; + HashIter *iter = HashTableIterCreate(); + if (!iter) + return NULL; + + while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) { + cram_map *m = hi->data.p; + int sz; + + mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]); + if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version))) + return NULL; + mp += sz; + mc++; + } + + HashTableIterDestroy(iter); + } +#else + mc = 0; + BLOCK_SIZE(map) = 0; + if (c->tags_used) { + khint_t k; + + for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) { + if (!kh_exist(c->tags_used, k)) + continue; + + mc++; + itf8_put_blk(map, kh_key(c->tags_used, k)); + + // use block content id 4 + switch(kh_key(c->tags_used, k) & 0xff) { + case 'Z': case 'H': + // string as byte_array_stop + if (fd->version == CRAM_1_VERS) { + BLOCK_APPEND(map, + "\005" 
// BYTE_ARRAY_STOP + "\005" // len + "\t" // stop-byte is also SAM separator + CRAM_EXT_TAG_S "\000\000\000", + 7); + } else { + BLOCK_APPEND(map, + "\005" // BYTE_ARRAY_STOP + "\002" // len + "\t" // stop-byte is also SAM separator + CRAM_EXT_TAG_S, + 4); + } + break; + + case 'A': case 'c': case 'C': + // byte array len, 1 byte + BLOCK_APPEND(map, + "\004" // BYTE_ARRAY_LEN + "\011" // length + "\003" // HUFFMAN (len) + "\004" // huffman-len + "\001" // 1 symbol + "\001" // symbol=1 byte value + "\001" // 1 length + "\000" // length=0 + "\001" // EXTERNAL (val) + "\001" // external-len + CRAM_EXT_TAG_S,// content-id + 11); + break; + + case 's': case 'S': + // byte array len, 2 byte + BLOCK_APPEND(map, + "\004" // BYTE_ARRAY_LEN + "\011" // length + "\003" // HUFFMAN (len) + "\004" // huffman-len + "\001" // 1 symbol + "\002" // symbol=2 byte value + "\001" // 1 length + "\000" // length=0 + "\001" // EXTERNAL (val) + "\001" // external-len + CRAM_EXT_TAG_S,// content-id + 11); + break; + + case 'i': case 'I': case 'f': + // byte array len, 4 byte + BLOCK_APPEND(map, + "\004" // BYTE_ARRAY_LEN + "\011" // length + "\003" // HUFFMAN (len) + "\004" // huffman-len + "\001" // 1 symbol + "\004" // symbol=4 byte value + "\001" // 1 length + "\000" // length=0 + "\001" // EXTERNAL (val) + "\001" // external-len + CRAM_EXT_TAG_S,// content-id + 11); + break; + + case 'B': + // Byte array of variable size, but we generate our tag + // byte stream at the wrong stage (during reading and not + // after slice header construction). So we use + // BYTE_ARRAY_LEN with the length codec being external + // too. 
+ BLOCK_APPEND(map, + "\004" // BYTE_ARRAY_LEN + "\006" // length + "\001" // EXTERNAL (len) + "\001" // external-len + "\004" // content-id + "\001" // EXTERNAL (val) + "\001" // external-len + CRAM_EXT_TAG_S,// content-id + 8); + break; + + default: + fprintf(stderr, "Unsupported SAM aux type '%c'\n", + kh_key(c->tags_used, k) & 0xff); + } + //mp += m->codec->store(m->codec, mp, NULL, fd->version); + } + } +#endif + itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); + itf8_put_blk(cb, mc); + BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); + + if (fd->verbose) + fprintf(stderr, "Wrote compression block header in %d bytes\n", + (int)BLOCK_SIZE(cb)); + + BLOCK_UPLEN(cb); + + cram_free_block(map); + + return cb; +} + + +/* + * Encodes a slice header: serialises the s->hdr fields (and, when the file is not CRAM_1_VERS, the 16-byte s->hdr->md5) into a malloc'd buffer that is attached to a new MAPPED_SLICE block as its data. + * + * Returns cram_block on success + * NULL on failure + */ +cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { + char *buf; + char *cp; + cram_block *b = cram_new_block(MAPPED_SLICE, 0); + int j; + + if (!b) + return NULL; + + if (NULL == (cp = buf = malloc(16+5*(8+s->hdr->num_blocks)))) { /* buffer capacity; the same bound is re-checked by the assert below */ + cram_free_block(b); + return NULL; + } + + cp += itf8_put(cp, s->hdr->ref_seq_id); + cp += itf8_put(cp, s->hdr->ref_seq_start); + cp += itf8_put(cp, s->hdr->ref_seq_span); + cp += itf8_put(cp, s->hdr->num_records); + if (fd->version != CRAM_1_VERS) + cp += itf8_put(cp, s->hdr->record_counter); + cp += itf8_put(cp, s->hdr->num_blocks); + cp += itf8_put(cp, s->hdr->num_content_ids); + for (j = 0; j < s->hdr->num_content_ids; j++) { + cp += itf8_put(cp, s->hdr->block_content_ids[j]); + } + if (s->hdr->content_type == MAPPED_SLICE) + cp += itf8_put(cp, s->hdr->ref_base_id); + + if (fd->version != CRAM_1_VERS) { + memcpy(cp, s->hdr->md5, 16); cp += 16; + } + + assert(cp-buf <= 16+5*(8+s->hdr->num_blocks)); + + b->data = (unsigned char *)buf; + b->comp_size = b->uncomp_size = cp-buf; + + return b; +} + + +/* + * Encodes a single slice from a container + * FIXME: break into smaller components. 
+ * + * Returns 0 on success + * -1 on failure + */ +static int cram_encode_slice(cram_fd *fd, cram_container *c, + cram_block_compression_hdr *h, cram_slice *s) { + int rec, r = 0, last_pos; + cram_block *core; + int nblk, embed_ref; + + embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0; + + /* + * Slice external blocks: + * ID 0 => base calls (insertions, soft-clip) + * ID 1 => qualities + * ID 2 => names + * ID 3 => TS (insert size), NP (next frag) + * ID 4 => tag values + * ID 5 => BA, ifdef BA_external + * ID 6 => tag IDs (TN), ifdef TN_external and CRAM_1_VERS + * ID 7 => TD tag dictionary, if !CRAM_1_VERS + */ + + /* Create cram slice header, num_blocks etc */ + s->hdr->ref_base_id = embed_ref ? CRAM_EXT_REF : -1; + s->hdr->record_counter = c->num_records + c->record_counter; + c->num_records += s->hdr->num_records; + nblk = (fd->version == CRAM_1_VERS) ? 5 : 6; +#ifdef BA_external + nblk++; +#endif +#ifdef TN_external + if (fd->version == CRAM_1_VERS) { + nblk++; + } +#endif + if (embed_ref) + nblk++; + + s->hdr->num_content_ids = nblk; + s->hdr->num_blocks = s->hdr->num_content_ids+1; + s->block = calloc(s->hdr->num_blocks, sizeof(s->block[0])); + s->hdr->block_content_ids = malloc(s->hdr->num_content_ids * + sizeof(int32_t)); + if (!s->block || !s->hdr->block_content_ids) + return -1; + s->hdr->block_content_ids[0] = 0; // core + s->hdr->block_content_ids[1] = CRAM_EXT_QUAL; + s->hdr->block_content_ids[2] = CRAM_EXT_NAME; + s->hdr->block_content_ids[3] = CRAM_EXT_TS_NP; + s->hdr->block_content_ids[4] = CRAM_EXT_TAG; + if (fd->version != CRAM_1_VERS) s->hdr->block_content_ids[5] = CRAM_EXT_SC; /* CRAM_1_VERS starts from nblk = 5, so index 5 only exists there when an optional BA/TN/REF id was counted above, and the first such id is stored at index 5 below; writing SC unconditionally could overrun the array */ + nblk = (fd->version == CRAM_1_VERS) ? 
5 : 6; +#ifdef BA_external + s->hdr->block_content_ids[(s->ba_id = ++nblk)-1] = CRAM_EXT_BA; +#endif +#ifdef TN_external + if (fd->version == CRAM_1_VERS) { + s->hdr->block_content_ids[(s->tn_id = ++nblk)-1] = CRAM_EXT_TN; + } +#endif + if (embed_ref) + s->hdr->block_content_ids[(s->ref_id = ++nblk)-1] = CRAM_EXT_REF; + + if (!(s->block[0] = cram_new_block(CORE, 0))) return -1; + if (!(s->block[1] = cram_new_block(EXTERNAL, CRAM_EXT_IN))) return -1; + if (!(s->block[2] = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) return -1; + if (!(s->block[3] = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) return -1; + if (!(s->block[4] = cram_new_block(EXTERNAL, CRAM_EXT_TS_NP))) return -1; + if (!(s->block[5] = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) return -1; + if (fd->version != CRAM_1_VERS) { + if (!(s->block[6] = cram_new_block(EXTERNAL, CRAM_EXT_SC))) + return -1; + } +#ifdef BA_external + if (!(s->block[s->ba_id] = cram_new_block(EXTERNAL, CRAM_EXT_BA))) + return -1; +#endif +#ifdef TN_external + if (fd->version == CRAM_1_VERS) { + if (!(s->block[s->tn_id] = cram_new_block(EXTERNAL, CRAM_EXT_TN))) + return -1; + } +#endif + if (embed_ref) { + if (!(s->block[s->ref_id] = cram_new_block(EXTERNAL, CRAM_EXT_REF))) + return -1; + BLOCK_APPEND(s->block[s->ref_id], + c->ref + c->first_base - c->ref_start, + c->last_base - c->first_base + 1); + } + + core = s->block[0]; + + /* Create a formal method for stealing from dstrings! 
*/ + s->block[4]->data = calloc(10, s->hdr->num_records); // NP TS + if (!s->block[4]->data) + return -1; + s->block[4]->comp_size = s->block[4]->uncomp_size = 0; + +#ifdef BA_external + s->block[s->ba_id]->data = calloc(1, s->BA_len); + if (!s->block[s->ba_id]->data) + return -1; + s->block[s->ba_id]->comp_size = s->block[s->ba_id]->uncomp_size = 0; +#endif + + /* Generate core block */ + if (!(s->hdr_block = cram_encode_slice_header(fd, s))) + return -1; + + last_pos = s->hdr->ref_seq_start; + for (rec = 0; rec < s->hdr->num_records; rec++) { + cram_record *cr = &s->crecs[rec]; + int32_t i32; + unsigned char uc; + + //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name); + + //printf("BF=0x%x\n", cr->flags); + // bf = cram_flag_swap[cr->flags]; + i32 = fd->cram_flag_swap[cr->flags & 0xfff]; + r |= h->BF_codec->encode(s, h->BF_codec, core, (char *)&i32, 1); + + i32 = cr->cram_flags; + r |= h->CF_codec->encode(s, h->CF_codec, core, + (char *)&i32, 1); + + if (fd->version != CRAM_1_VERS) + r |= h->RI_codec->encode(s, h->RI_codec, core, + (char *)&cr->ref_id, 1); + + r |= h->RL_codec->encode(s, h->RL_codec, core, + (char *)&cr->len, 1); + + if (c->pos_sorted) { + i32 = cr->apos - last_pos; + r |= h->AP_codec->encode(s, h->AP_codec, core, (char *)&i32, 1); + last_pos = cr->apos; + } else { + i32 = cr->apos; + r |= h->AP_codec->encode(s, h->AP_codec, core, (char *)&i32, 1); + } + + r |= h->RG_codec->encode(s, h->RG_codec, core, + (char *)&cr->rg, 1); + + if (c->comp_hdr->read_names_included) { + // RN codec: Already stored in block[3]. + } + + if (cr->cram_flags & CRAM_FLAG_DETACHED) { + i32 = cr->mate_flags; + r |= h->MF_codec->encode(s, h->MF_codec, core, (char *)&i32, 1); + + if (!c->comp_hdr->read_names_included) { + // RN codec: Already stored in block[3]. 
+ } + +#ifndef NS_external + r |= h->NS_codec->encode(s, h->NS_codec, core, + (char *)&cr->mate_ref_id, 1); +#else + s->block[4]->uncomp_size += + itf8_put(&s->block[4]->data[s->block[4]->uncomp_size], + cr->mate_ref_id); +#endif + +#ifndef TS_external + r |= h->NP_codec->encode(s, h->NP_codec, core, + (char *)&cr->mate_pos, 1); + + r |= h->TS_codec->encode(s, h->TS_codec, core, + (char *)&cr->tlen, 1); +#else + s->block[4]->uncomp_size += + itf8_put((char *)&s->block[4]->data[s->block[4]->uncomp_size], + cr->mate_pos); + s->block[4]->uncomp_size += + itf8_put((char *)&s->block[4]->data[s->block[4]->uncomp_size], + cr->tlen); +#endif + } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { + r |= h->NF_codec->encode(s, h->NF_codec, core, + (char *)&cr->mate_line, 1); + } + + /* Aux tags */ + if (fd->version == CRAM_1_VERS) { + uc = cr->ntags; + r |= h->TC_codec->encode(s, h->TC_codec, core, (char *)&uc, 1); +#ifndef TN_external + { + int j; + for (j = 0; j < cr->ntags; j++) { + uint32_t i32 = s->TN[cr->TN_idx + j]; // id + r |= h->TN_codec->encode(s, h->TN_codec, core, + (char *)&i32, 1); + } + } +#endif + } else { + r |= h->TL_codec->encode(s, h->TL_codec, core, (char *)&cr->TL, 1); + } + + // qual + // QS codec : Already stored in block[2]. 
+ + // features (diffs) + if (!(cr->flags & BAM_FUNMAP)) { + int prev_pos = 0, j; + + r |= h->FN_codec->encode(s, h->FN_codec, core, + (char *)&cr->nfeature, 1); + for (j = 0; j < cr->nfeature; j++) { + cram_feature *f = &s->features[cr->feature + j]; + + uc = f->X.code; + r |= h->FC_codec->encode(s, h->FC_codec, core, + (char *)&uc, 1); + i32 = f->X.pos - prev_pos; + r |= h->FP_codec->encode(s, h->FP_codec, core, + (char *)&i32, 1); + prev_pos = f->X.pos; + + switch(f->X.code) { + //char *seq; + + case 'X': + //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base); + + uc = f->X.base; + r |= h->BS_codec->encode(s, h->BS_codec, core, + (char *)&uc, 1); + break; + case 'S': + //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; + //r |= h->SC_codec->encode(s, h->SC_codec, core, + // seq, f->S.len); + break; + case 'I': + //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; + //r |= h->IN_codec->encode(s, h->IN_codec, core, + // seq, f->S.len); + break; + case 'i': + uc = f->i.base; +#ifdef BA_external + s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size++] = uc; +#else + r |= h->BA_codec->encode(s, h->BA_codec, core, + (char *)&uc, 1); +#endif + //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; + //r |= h->IN_codec->encode(s, h->IN_codec, core, + // seq, 1); + break; + case 'D': + i32 = f->D.len; + r |= h->DL_codec->encode(s, h->DL_codec, core, + (char *)&i32, 1); + break; + + case 'B': +// // Used when we try to store a non ACGTN base or an N +// // that aligns against a non ACGTN reference + + uc = f->B.base; +#ifdef BA_external + s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size++] = uc; +#else + r |= h->BA_codec->encode(s, h->BA_codec, core, + (char *)&uc, 1); +#endif + +// Already added +// uc = f->B.qual; +// r |= h->QS_codec->encode(s, h->QS_codec, core, +// (char *)&uc, 1); + break; + + case 'Q': +// Already added +// uc = f->B.qual; +// r |= h->QS_codec->encode(s, h->QS_codec, core, +// (char *)&uc, 1); + break; + + case 'N': + i32 = 
f->N.len; + r |= h->RS_codec->encode(s, h->RS_codec, core, + (char *)&i32, 1); + break; + + case 'P': + i32 = f->P.len; + r |= h->PD_codec->encode(s, h->PD_codec, core, + (char *)&i32, 1); + break; + + case 'H': + i32 = f->H.len; + r |= h->HC_codec->encode(s, h->HC_codec, core, + (char *)&i32, 1); + break; + + + default: + fprintf(stderr, "unhandled feature code %c\n", + f->X.code); + return -1; + } + } + + r |= h->MQ_codec->encode(s, h->MQ_codec, core, + (char *)&cr->mqual, 1); + } else { + char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq; +#ifdef BA_external + memcpy(&s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size], + seq, cr->len); + s->block[s->ba_id]->uncomp_size += cr->len; +#else + r |= h->BA_codec->encode(s, h->BA_codec, core, seq, cr->len); +#endif + } + + if (r) + return -1; + } + s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7); + s->block[0]->comp_size = s->block[0]->uncomp_size; + + // FIXME: we should avoid creating these in the first place and just + // point them to s->base_blk et al. 
+ cram_free_block(s->block[1]); + cram_free_block(s->block[2]); + cram_free_block(s->block[3]); + cram_free_block(s->block[5]); + if (fd->version != CRAM_1_VERS) { + cram_free_block(s->block[6]); + BLOCK_UPLEN(s->soft_blk); + s->block[6] = s->soft_blk; + s->soft_blk = NULL; + } + BLOCK_UPLEN(s->base_blk); s->block[1] = s->base_blk; s->base_blk = NULL; + BLOCK_UPLEN(s->qual_blk); s->block[2] = s->qual_blk; s->qual_blk = NULL; + BLOCK_UPLEN(s->name_blk); s->block[3] = s->name_blk; s->name_blk = NULL; + BLOCK_UPLEN(s->aux_blk); s->block[5] = s->aux_blk; s->aux_blk = NULL; + +#ifdef TN_external + if (fd->version == CRAM_1_VERS) { + cram_free_block(s->block[s->tn_id]); + BLOCK_UPLEN(s->tn_blk); s->block[s->tn_id] = s->tn_blk; + s->tn_blk = NULL; + } +#endif + + s->block[4]->comp_size = s->block[4]->uncomp_size; + +#ifdef BA_external + s->block[s->ba_id]->comp_size = s->block[s->ba_id]->uncomp_size; +#endif + + /* Compress the CORE Block too, with minimal zlib level */ + if (fd->level > 5) + cram_compress_block(fd, s->block[0], NULL, 1, Z_CRAM_STRAT, -1, -1); + +#define USE_METRICS + +#ifdef USE_METRICS +# define LEVEL2 1 +# define STRAT2 Z_RLE +#else +# define LEVEL2 -1 +# define STRAT2 -1 +#endif + + /* Compress the other blocks */ + if (cram_compress_block(fd, s->block[1], NULL, //IN (seq) + fd->level, Z_CRAM_STRAT, + -1, -1)) + return -1; + + if (fd->level == 0) { + /* Do nothing */ + } else if (fd->level == 1) { + if (cram_compress_block(fd, s->block[2], fd->m[1], //qual + 1, Z_RLE, -1, -1)) + return -1; + if (cram_compress_block(fd, s->block[5], fd->m[4], //Tags + 1, Z_RLE, -1, -1)) + return -1; + } else if (fd->level < 3) { + if (cram_compress_block(fd, s->block[2], fd->m[1], //qual + 1, Z_RLE, + 1, Z_HUFFMAN_ONLY)) + return -1; + if (cram_compress_block(fd, s->block[5], fd->m[4], //Tags + 1, Z_RLE, + 1, Z_HUFFMAN_ONLY)) + return -1; + } else { + if (cram_compress_block(fd, s->block[2], fd->m[1], //qual + fd->level, Z_CRAM_STRAT, + LEVEL2, STRAT2)) + return -1; + 
if (cram_compress_block(fd, s->block[5], fd->m[4], //Tags + fd->level, Z_CRAM_STRAT, + LEVEL2, STRAT2)) + return -1; + } + if (cram_compress_block(fd, s->block[3], NULL, //Name + fd->level, Z_CRAM_STRAT, + -1, -1)) + return -1; + if (cram_compress_block(fd, s->block[4], NULL, //TS, NP + fd->level, Z_CRAM_STRAT, + -1, -1)) + return -1; + if (fd->version != CRAM_1_VERS) { + if (cram_compress_block(fd, s->block[6], NULL, //SC (seq) + fd->level, Z_CRAM_STRAT, + -1, -1)) + return -1; + } +#ifdef BA_external + if (cram_compress_block(fd, s->block[s->ba_id], NULL, + fd->level, Z_CRAM_STRAT, -1, -1)) + return -1; +#endif +#ifdef TN_external + if (fd->version == CRAM_1_VERS) { + if (cram_compress_block(fd, s->block[s->tn_id], NULL, + fd->level, Z_DEFAULT_STRATEGY, -1, -1)) + return -1; + } +#endif + if (embed_ref) { + BLOCK_UPLEN(s->block[s->ref_id]); + if (cram_compress_block(fd, s->block[s->ref_id], NULL, + fd->level, Z_DEFAULT_STRATEGY, -1, -1)) + return -1; + } + + return r ? -1 : 0; +} + +/* + * Encodes all slices in a container into blocks. 
+ * Returns 0 on success + * -1 on failure + */ +int cram_encode_container(cram_fd *fd, cram_container *c) { + int i, j, slice_offset; + cram_block_compression_hdr *h = c->comp_hdr; + cram_block *c_hdr; + int multi_ref = 0; + int r1, r2, sn, nref; + spare_bams *spares; + + /* Cache references up-front if we have unsorted access patterns */ + pthread_mutex_lock(&fd->ref_lock); + nref = fd->refs->nref; + pthread_mutex_unlock(&fd->ref_lock); + + if (c->refs_used) { + for (i = 0; i < nref; i++) { + if (c->refs_used[i]) { + cram_get_ref(fd, i, 1, 0); + } + } + } + + /* Fetch reference sequence */ + if (!fd->no_ref) { + bam_seq_t *b = c->bams[0]; + char *ref; + + ref = cram_get_ref(fd, bam_ref(b), 1, 0); + if (!ref && bam_ref(b) >= 0) { + fprintf(stderr, "Failed to load reference #%d\n", bam_ref(b)); + return -1; + } + if ((c->ref_id = bam_ref(b)) >= 0) { + c->ref_seq_id = c->ref_id; + c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; + c->ref_start = 1; + c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; + } else { + c->ref_seq_id = c->ref_id; // FIXME remove one var! + } + } else { + c->ref_seq_id = c->ref_id; // FIXME remove one var! + } + + /* Turn bams into cram_records and gather basic stats */ + for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) { + cram_slice *s = c->slices[sn]; + int first_base = INT_MAX, last_base = INT_MIN; + + assert(sn < c->curr_slice); + + /* FIXME: we could create our slice objects here too instead of + * in cram_put_bam_seq. It's more natural here and also this is + * bit is threaded so it's less work in the main thread. 
+ */ + + for (r2 = 0; r1 < c->curr_c_rec && r2 < c->max_rec; r1++, r2++) { + cram_record *cr = &s->crecs[r2]; + bam_seq_t *b = c->bams[r1]; + + /* If multi-ref we need to cope with changing reference per seq */ + if (c->multi_seq && !fd->no_ref) { + if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) { + if (c->ref_seq_id >= 0) + cram_ref_decr(fd->refs, c->ref_seq_id); + + if (!cram_get_ref(fd, bam_ref(b), 1, 0)) { + fprintf(stderr, "Failed to load reference #%d\n", + bam_ref(b)); + return -1; + } + + c->ref_seq_id = bam_ref(b); // overwritten later by -2 + assert(fd->refs->ref_id[c->ref_seq_id]->seq); + c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; + c->ref_start = 1; + c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; + } + } + + process_one_read(fd, c, s, cr, b, r2); + + if (first_base > cr->apos) + first_base = cr->apos; + + if (last_base < cr->aend) + last_base = cr->aend; + } + + if (c->multi_seq) { + s->hdr->ref_seq_id = -2; + s->hdr->ref_seq_start = 0; + s->hdr->ref_seq_span = 0; + } else { + s->hdr->ref_seq_id = c->ref_id; + s->hdr->ref_seq_start = first_base; + s->hdr->ref_seq_span = last_base - first_base + 1; + } + s->hdr->num_records = r2; + } + + /* Link our bams[] array onto the spare bam list for reuse */ + spares = malloc(sizeof(*spares)); + pthread_mutex_lock(&fd->bam_list_lock); + spares->bams = c->bams; + spares->next = fd->bl; + fd->bl = spares; + pthread_mutex_unlock(&fd->bam_list_lock); + c->bams = NULL; + + /* Detect if a multi-seq container */ + cram_stats_encoding(fd, c->RI_stats); + multi_ref = c->RI_stats->nvals > 1; + + if (multi_ref) { + if (fd->verbose) + fprintf(stderr, "Multi-ref container\n"); + c->ref_seq_id = -2; + c->ref_seq_start = 0; + c->ref_seq_span = 0; + } + + + /* Compute MD5s */ + for (i = 0; i < c->curr_slice; i++) { + cram_slice *s = c->slices[i]; + + if (fd->version != CRAM_1_VERS) { + if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) { + MD5_CTX md5; + MD5_Init(&md5); + MD5_Update(&md5, + c->ref + 
s->hdr->ref_seq_start - c->ref_start, + s->hdr->ref_seq_span); + MD5_Final(s->hdr->md5, &md5); + } else { + memset(s->hdr->md5, 0, 16); + } + } + } + + c->num_records = 0; + c->num_blocks = 0; + c->length = 0; + + //fprintf(stderr, "=== BF ===\n"); + h->BF_codec = cram_encoder_init(cram_stats_encoding(fd, c->BF_stats), + c->BF_stats, E_INT, NULL, + fd->version); + + //fprintf(stderr, "=== CF ===\n"); + h->CF_codec = cram_encoder_init(cram_stats_encoding(fd, c->CF_stats), + c->CF_stats, E_INT, NULL, + fd->version); +// fprintf(stderr, "=== RN ===\n"); +// h->RN_codec = cram_encoder_init(cram_stats_encoding(fd, c->RN_stats), +// c->RN_stats, E_BYTE_ARRAY, NULL, +// fd->version); + + //fprintf(stderr, "=== AP ===\n"); + if (c->pos_sorted) { + h->AP_codec = cram_encoder_init(cram_stats_encoding(fd, c->AP_stats), + c->AP_stats, E_INT, NULL, + fd->version); + } else { + int p[2] = {0, c->max_apos}; + h->AP_codec = cram_encoder_init(E_BETA, NULL, E_INT, p, fd->version); + } + + //fprintf(stderr, "=== RG ===\n"); + h->RG_codec = cram_encoder_init(cram_stats_encoding(fd, c->RG_stats), + c->RG_stats, E_INT, NULL, + fd->version); + + //fprintf(stderr, "=== MQ ===\n"); + h->MQ_codec = cram_encoder_init(cram_stats_encoding(fd, c->MQ_stats), + c->MQ_stats, E_INT, NULL, + fd->version); + + //fprintf(stderr, "=== NS ===\n"); +#ifdef NS_external + h->NS_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT, + (void *)CRAM_EXT_NS, + fd->version); +#else + h->NS_codec = cram_encoder_init(cram_stats_encoding(fd, c->NS_stats), + c->NS_stats, E_INT, NULL, + fd->version); +#endif + + //fprintf(stderr, "=== MF ===\n"); + h->MF_codec = cram_encoder_init(cram_stats_encoding(fd, c->MF_stats), + c->MF_stats, E_INT, NULL, + fd->version); + +#ifdef TS_external + h->TS_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT, + (void *)CRAM_EXT_TS_NP, + fd->version); + h->NP_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT, + (void *)CRAM_EXT_TS_NP, + fd->version); +#else + //fprintf(stderr, "=== TS 
===\n"); + h->TS_codec = cram_encoder_init(cram_stats_encoding(fd, c->TS_stats), + c->TS_stats, E_INT, NULL, + fd->version); + //fprintf(stderr, "=== NP ===\n"); + h->NP_codec = cram_encoder_init(cram_stats_encoding(fd, c->NP_stats), + c->NP_stats, E_INT, NULL, + fd->version); +#endif + + //fprintf(stderr, "=== NF ===\n"); + h->NF_codec = cram_encoder_init(cram_stats_encoding(fd, c->NF_stats), + c->NF_stats, E_INT, NULL, + fd->version); + + //fprintf(stderr, "=== RL ===\n"); + h->RL_codec = cram_encoder_init(cram_stats_encoding(fd, c->RL_stats), + c->RL_stats, E_INT, NULL, + fd->version); + + //fprintf(stderr, "=== FN ===\n"); + h->FN_codec = cram_encoder_init(cram_stats_encoding(fd, c->FN_stats), + c->FN_stats, E_INT, NULL, + fd->version); + + //fprintf(stderr, "=== FC ===\n"); + h->FC_codec = cram_encoder_init(cram_stats_encoding(fd, c->FC_stats), + c->FC_stats, E_BYTE, NULL, + fd->version); + + //fprintf(stderr, "=== FP ===\n"); + h->FP_codec = cram_encoder_init(cram_stats_encoding(fd, c->FP_stats), + c->FP_stats, E_INT, NULL, + fd->version); + + //fprintf(stderr, "=== DL ===\n"); + h->DL_codec = cram_encoder_init(cram_stats_encoding(fd, c->DL_stats), + c->DL_stats, E_INT, NULL, + fd->version); + +#ifdef BA_external + h->BA_codec = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, + (void *)CRAM_EXT_BA, + fd->version); +#else + //fprintf(stderr, "=== BA ===\n"); + h->BA_codec = cram_encoder_init(cram_stats_encoding(fd, c->BA_stats), + c->BA_stats, E_BYTE, NULL, + fd->version); +#endif + + //fprintf(stderr, "=== BS ===\n"); + h->BS_codec = cram_encoder_init(cram_stats_encoding(fd, c->BS_stats), + c->BS_stats, E_BYTE, NULL, + fd->version); + + if (fd->version == CRAM_1_VERS) { + h->TL_codec = NULL; + h->RI_codec = NULL; + h->RS_codec = NULL; + h->PD_codec = NULL; + h->HC_codec = NULL; + h->SC_codec = NULL; + + //fprintf(stderr, "=== TC ===\n"); + h->TC_codec = cram_encoder_init(cram_stats_encoding(fd, c->TC_stats), + c->TC_stats, E_BYTE, NULL, + fd->version); + + 
//fprintf(stderr, "=== TN ===\n"); +#ifdef TN_external + h->TN_codec = cram_encoder_init(E_EXTERNAL, NULL, E_INT, + (void *)CRAM_EXT_TN, + fd->version); +#else + h->TN_codec = cram_encoder_init(cram_stats_encoding(fd, c->TN_stats), + c->TN_stats, E_INT, NULL, + fd->version); +#endif + } else { + int i2[2] = {0, CRAM_EXT_SC}; + + h->TC_codec = NULL; + h->TN_codec = NULL; + + //fprintf(stderr, "=== TL ===\n"); + h->TL_codec = cram_encoder_init(cram_stats_encoding(fd, c->TL_stats), + c->TL_stats, E_INT, NULL, + fd->version); + + + //fprintf(stderr, "=== RI ===\n"); + h->RI_codec = cram_encoder_init(cram_stats_encoding(fd, c->RI_stats), + c->RI_stats, E_INT, NULL, + fd->version); + + //fprintf(stderr, "=== RS ===\n"); + h->RS_codec = cram_encoder_init(cram_stats_encoding(fd, c->RS_stats), + c->RS_stats, E_INT, NULL, + fd->version); + + //fprintf(stderr, "=== PD ===\n"); + h->PD_codec = cram_encoder_init(cram_stats_encoding(fd, c->PD_stats), + c->PD_stats, E_INT, NULL, + fd->version); + + //fprintf(stderr, "=== HC ===\n"); + h->HC_codec = cram_encoder_init(cram_stats_encoding(fd, c->HC_stats), + c->HC_stats, E_INT, NULL, + fd->version); + + //fprintf(stderr, "=== SC ===\n"); + h->SC_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, + E_BYTE_ARRAY, (void *)i2, + fd->version); + } + + //fprintf(stderr, "=== IN ===\n"); + { + int i2[2] = {0, CRAM_EXT_IN}; + h->IN_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, + E_BYTE_ARRAY, (void *)i2, + fd->version); + } + + { + //int i2[2] = {0, 1}; + //h->QS_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, (void *)i2, + // fd->version); + h->QS_codec = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, + (void *)CRAM_EXT_QUAL, + fd->version); + } + { + int i2[2] = {0, CRAM_EXT_NAME}; + h->RN_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, + E_BYTE_ARRAY, (void *)i2, + fd->version); + } + + + /* Encode slices */ + for (i = 0; i < c->curr_slice; i++) { + if (fd->verbose) + fprintf(stderr, "Encode slice %d\n", i); + if 
(cram_encode_slice(fd, c, h, c->slices[i]) != 0) + return -1; + } + + /* Create compression header */ + { + h->ref_seq_id = c->ref_seq_id; + h->ref_seq_start = c->ref_seq_start; + h->ref_seq_span = c->ref_seq_span; + h->num_records = c->num_records; + + h->mapped_qs_included = 0; // fixme + h->unmapped_qs_included = 0; // fixme + // h->... fixme + memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20); + + if (!(c_hdr = cram_encode_compression_header(fd, c, h))) + return -1; + } + + /* Compute landmarks */ + /* Fill out slice landmarks */ + c->num_landmarks = c->curr_slice; + c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark)); + if (!c->landmark) + return -1; + + /* + * Slice offset starts after the first block, so we need to simulate + * writing it to work out the correct offset + */ + { + slice_offset = c_hdr->method == RAW + ? c_hdr->uncomp_size + : c_hdr->comp_size; + slice_offset += 2 + + itf8_size(c_hdr->content_id) + + itf8_size(c_hdr->comp_size) + + itf8_size(c_hdr->uncomp_size); + } + + c->ref_seq_id = c->slices[0]->hdr->ref_seq_id; + c->ref_seq_start = c->slices[0]->hdr->ref_seq_start; + c->ref_seq_span = c->slices[0]->hdr->ref_seq_span; + for (i = 0; i < c->curr_slice; i++) { + cram_slice *s = c->slices[i]; + + c->num_blocks += s->hdr->num_blocks + 2; + c->landmark[i] = slice_offset; + + if (s->hdr->ref_seq_start + s->hdr->ref_seq_span > + c->ref_seq_start + c->ref_seq_span) { + c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span + - c->ref_seq_start; + } + + slice_offset += s->hdr_block->method == RAW + ? s->hdr_block->uncomp_size + : s->hdr_block->comp_size; + + slice_offset += 2 + + itf8_size(s->hdr_block->content_id) + + itf8_size(s->hdr_block->comp_size) + + itf8_size(s->hdr_block->uncomp_size); + + for (j = 0; j < s->hdr->num_blocks; j++) { + slice_offset += 2 + + itf8_size(s->block[j]->content_id) + + itf8_size(s->block[j]->comp_size) + + itf8_size(s->block[j]->uncomp_size); + + slice_offset += s->block[j]->method == RAW + ? 
s->block[j]->uncomp_size + : s->block[j]->comp_size; + } + } + c->length += slice_offset; // just past the final slice + + c->comp_hdr_block = c_hdr; + + if (c->ref_seq_id >= 0) { + cram_ref_decr(fd->refs, c->ref_seq_id); + } + + /* Cache references up-front if we have unsorted access patterns */ + if (c->refs_used) { + for (i = 0; i < fd->refs->nref; i++) { + if (c->refs_used[i]) + cram_ref_decr(fd->refs, i); + } + } + + return 0; +} + + +/* + * Adds a feature code to a read within a slice. For purposes of minimising + * memory allocations and fragmentation we have one array of features for all + * reads within the slice. We return the index into this array for this new + * feature. + * + * Returns feature index on success + * -1 on failure. + */ +static int cram_add_feature(cram_container *c, cram_slice *s, + cram_record *r, cram_feature *f) { + if (s->nfeatures >= s->afeatures) { + s->afeatures = s->afeatures ? s->afeatures*2 : 1024; + s->features = realloc(s->features, s->afeatures*sizeof(*s->features)); + if (!s->features) + return -1; + } + + if (!r->nfeature++) { + r->feature = s->nfeatures; + cram_stats_add(c->FP_stats, f->X.pos); + } else { + cram_stats_add(c->FP_stats, + f->X.pos - s->features[r->feature + r->nfeature-2].X.pos); + } + cram_stats_add(c->FC_stats, f->X.code); + + s->features[s->nfeatures++] = *f; + + return 0; +} + +static int cram_add_substitution(cram_fd *fd, cram_container *c, + cram_slice *s, cram_record *r, + int pos, char base, char qual, char ref) { + cram_feature f; + + // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN + if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) { + f.X.pos = pos+1; + f.X.code = 'X'; + f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f]; + cram_stats_add(c->BS_stats, f.X.base); + } else { + f.B.pos = pos+1; + f.B.code = 'B'; + f.B.base = base; + f.B.qual = qual; + cram_stats_add(c->BA_stats, f.B.base); + cram_stats_add(c->QS_stats, f.B.qual); + BLOCK_APPEND_CHAR(s->qual_blk, qual); + } + 
return cram_add_feature(c, s, r, &f); +} + +static int cram_add_base(cram_fd *fd, cram_container *c, + cram_slice *s, cram_record *r, + int pos, char base, char qual) { + cram_feature f; + f.B.pos = pos+1; + f.B.code = 'B'; + f.B.base = base; + f.B.qual = qual; +#ifdef BA_external + s->BA_len++; +#else + cram_stats_add(c->BA_stats, base); +#endif + cram_stats_add(c->QS_stats, qual); + BLOCK_APPEND_CHAR(s->qual_blk, qual); + return cram_add_feature(c, s, r, &f); +} + +static int cram_add_quality(cram_fd *fd, cram_container *c, + cram_slice *s, cram_record *r, + int pos, char qual) { + cram_feature f; + f.Q.pos = pos+1; + f.Q.code = 'Q'; + f.Q.qual = qual; + cram_stats_add(c->QS_stats, qual); + BLOCK_APPEND_CHAR(s->qual_blk, qual); + return cram_add_feature(c, s, r, &f); +} + +static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r, + int pos, int len, char *base) { + cram_feature f; + f.D.pos = pos+1; + f.D.code = 'D'; + f.D.len = len; + cram_stats_add(c->DL_stats, len); + return cram_add_feature(c, s, r, &f); +} + +static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r, + int pos, int len, char *base, int version) { + cram_feature f; + f.S.pos = pos+1; + f.S.code = 'S'; + f.S.len = len; + if (version == CRAM_1_VERS) { + f.S.seq_idx = BLOCK_SIZE(s->base_blk); + BLOCK_APPEND(s->base_blk, base, len); + BLOCK_APPEND_CHAR(s->base_blk, '\0'); + } else { + f.S.seq_idx = BLOCK_SIZE(s->soft_blk); + if (base) { + BLOCK_APPEND(s->soft_blk, base, len); + } else { + int i; + for (i = 0; i < len; i++) + BLOCK_APPEND_CHAR(s->soft_blk, 'N'); + } + BLOCK_APPEND_CHAR(s->soft_blk, '\0'); + } + return cram_add_feature(c, s, r, &f); +} + +static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r, + int pos, int len, char *base) { + cram_feature f; + f.S.pos = pos+1; + f.S.code = 'H'; + f.S.len = len; + cram_stats_add(c->HC_stats, len); + return cram_add_feature(c, s, r, &f); +} + +static int cram_add_skip(cram_container 
*c, cram_slice *s, cram_record *r, + int pos, int len, char *base) { + cram_feature f; + f.S.pos = pos+1; + f.S.code = 'N'; + f.S.len = len; + cram_stats_add(c->RS_stats, len); + return cram_add_feature(c, s, r, &f); +} + +static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r, + int pos, int len, char *base) { + cram_feature f; + f.S.pos = pos+1; + f.S.code = 'P'; + f.S.len = len; + cram_stats_add(c->PD_stats, len); + return cram_add_feature(c, s, r, &f); +} + +static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, + int pos, int len, char *base) { + cram_feature f; + f.I.pos = pos+1; + if (len == 1) { + char b = base ? *base : 'N'; + f.i.code = 'i'; + f.i.base = b; +#ifdef BA_external + s->BA_len++; +#else + cram_stats_add(c->BA_stats, b); +#endif + } else { + f.I.code = 'I'; + f.I.len = len; + f.S.seq_idx = BLOCK_SIZE(s->base_blk); + if (base) { + BLOCK_APPEND(s->base_blk, base, len); + } else { + int i; + for (i = 0; i < len; i++) + BLOCK_APPEND_CHAR(s->base_blk, 'N'); + } + BLOCK_APPEND_CHAR(s->base_blk, '\0'); + } + return cram_add_feature(c, s, r, &f); +} + +/* + * Encodes auxiliary data. 
+ * Returns the read-group parsed out of the BAM aux fields on success + * NULL on failure or no rg present (FIXME) + */ +static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, + cram_slice *s, cram_record *cr) { + char *aux, *tmp, *rg = NULL, *tmp_tn; + int aux_size = bam_blk_size(b) - + ((char *)bam_aux(b) - (char *)&bam_ref(b)); + + /* Worst case is 1 nul char on every ??:Z: string, so +33% */ + BLOCK_GROW(s->aux_blk, aux_size*1.34+1); + tmp = (char *)BLOCK_END(s->aux_blk); + +#ifdef TN_external + BLOCK_GROW(s->tn_blk, aux_size); + tmp_tn = (char *)BLOCK_END(s->tn_blk); +#endif + + aux = (char *)bam_aux(b); +#ifndef TN_external + cr->TN_idx = s->nTN; +#endif + while (aux[0] != 0) { + int32_t i32; + int r; + + if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { + rg = &aux[3]; + while (*aux++); + continue; + } + if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { + while (*aux++); + continue; + } + if (aux[0] == 'N' && aux[1] == 'M') { + switch(aux[2]) { + case 'A': case 'C': case 'c': aux+=4; break; + case 'I': case 'i': case 'f': aux+=7; break; + default: + fprintf(stderr, "Unhandled type code for NM tag\n"); + return NULL; + } + continue; + } + + cr->ntags++; + + i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2]; + kh_put(s_i2i, c->tags_used, i32, &r); + if (-1 == r) + return NULL; + +#ifndef TN_external + if (s->nTN >= s->aTN) { + s->aTN = s->aTN ? 
s->aTN*2 : 1024; + if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN)))) + return NULL; + } + s->TN[s->nTN++] = i32; + cram_stats_add(c->TN_stats, i32); +#else + tmp_tn += itf8_put(tmp_tn, i32); +#endif + + switch(aux[2]) { + case 'A': case 'C': case 'c': + aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + *tmp++=*aux++; + break; + + case 'S': case 's': + aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + *tmp++=*aux++; *tmp++=*aux++; + break; + + case 'I': case 'i': case 'f': + aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + break; + + case 'd': + aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + break; + + case 'Z': case 'H': + aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + while ((*tmp++=*aux++)); + *tmp++ = '\t'; // stop byte + break; + + case 'B': { + int type = aux[3], blen; + uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + + (((unsigned char *)aux)[5]<< 8) + + (((unsigned char *)aux)[6]<<16) + + (((unsigned char *)aux)[7]<<24)); + // skip TN field + aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + + // We use BYTE_ARRAY_LEN with external length, so store that first + switch (type) { + case 'c': case 'C': + blen = count; + break; + case 's': case 'S': + blen = 2*count; + break; + case 'i': case 'I': case 'f': + blen = 4*count; + break; + default: + fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n", + type); + return NULL; + + } + + tmp += itf8_put(tmp, blen+5); + + *tmp++=*aux++; // sub-type & length + *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + + // The tag data itself + memcpy(tmp, aux, blen); tmp += blen; aux += blen; + + //cram_stats_add(c->aux_B_stats, blen); + break; + } + default: + fprintf(stderr, "Unknown aux type '%c'\n", aux[2]); + return NULL; + } + } + 
cram_stats_add(c->TC_stats, cr->ntags); + + cr->aux = BLOCK_SIZE(s->aux_blk); + cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); + BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); + assert(s->aux_blk->byte <= s->aux_blk->alloc); + +#ifdef TN_external + cr->tn = BLOCK_SIZE(s->tn_blk); + BLOCK_SIZE(s->tn_blk) = (uc *)tmp_tn - BLOCK_DATA(s->tn_blk); + assert(s->tn_blk->byte <= s->tn_blk->alloc); +#endif + + return rg; +} + +/* + * Encodes auxiliary data. Largely duplicated from above, but done so to + * keep it simple and avoid a myriad of version ifs. + * + * Returns the read-group parsed out of the BAM aux fields on success + * NULL on failure or no rg present (FIXME) + */ +static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, + cram_slice *s, cram_record *cr) { + char *aux, *orig, *tmp, *rg = NULL; +#ifdef SAMTOOLS + int aux_size = bam_get_l_aux(b); +#else + int aux_size = bam_blk_size(b) - + ((char *)bam_aux(b) - (char *)&bam_ref(b)); +#endif + cram_block *td_b = c->comp_hdr->TD_blk; + int TD_blk_size = BLOCK_SIZE(td_b), new; + char *key; + khint_t k; + + + /* Worst case is 1 nul char on every ??:Z: string, so +33% */ + BLOCK_GROW(s->aux_blk, aux_size*1.34+1); + tmp = (char *)BLOCK_END(s->aux_blk); + + + orig = aux = (char *)bam_aux(b); + + // Copy aux keys to td_b and aux values to s->aux_blk + while (aux - orig < aux_size && aux[0] != 0) { + uint32_t i32; + int r; + + if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { + rg = &aux[3]; + while (*aux++); + continue; + } + if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { + while (*aux++); + continue; + } + if (aux[0] == 'N' && aux[1] == 'M') { + switch(aux[2]) { + case 'A': case 'C': case 'c': aux+=4; break; + case 'S': case 's': aux+=5; break; + case 'I': case 'i': case 'f': aux+=7; break; + default: + fprintf(stderr, "Unhandled type code for NM tag\n"); + return NULL; + } + continue; + } + + BLOCK_APPEND(td_b, aux, 3); + + i32 = (aux[0]<<16) | (aux[1]<<8) | 
aux[2]; + kh_put(s_i2i, c->tags_used, i32, &r); + if (-1 == r) + return NULL; + + switch(aux[2]) { + case 'A': case 'C': case 'c': + aux+=3; + *tmp++=*aux++; + break; + + case 'S': case 's': + aux+=3; + *tmp++=*aux++; *tmp++=*aux++; + break; + + case 'I': case 'i': case 'f': + aux+=3; + *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + break; + + case 'd': + aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + + case 'Z': case 'H': + aux+=3; + while ((*tmp++=*aux++)); + *tmp++ = '\t'; // stop byte + break; + + case 'B': { + int type = aux[3], blen; + uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + + (((unsigned char *)aux)[5]<< 8) + + (((unsigned char *)aux)[6]<<16) + + (((unsigned char *)aux)[7]<<24)); + // skip TN field + aux+=3; + + // We use BYTE_ARRAY_LEN with external length, so store that first + switch (type) { + case 'c': case 'C': + blen = count; + break; + case 's': case 'S': + blen = 2*count; + break; + case 'i': case 'I': case 'f': + blen = 4*count; + break; + default: + fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n", + type); + return NULL; + + } + + tmp += itf8_put(tmp, blen+5); + + *tmp++=*aux++; // sub-type & length + *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; + + // The tag data itself + memcpy(tmp, aux, blen); tmp += blen; aux += blen; + + //cram_stats_add(c->aux_B_stats, blen); + break; + } + default: + fprintf(stderr, "Unknown aux type '%c'\n", aux[2]); + return NULL; + } + } + + // FIXME: sort BLOCK_DATA(td_b) by char[3] triples + + // And and increment TD hash entry + BLOCK_APPEND_CHAR(td_b, 0); + + // Duplicate key as BLOCK_DATA() can be realloced to a new pointer. 
+ key = string_ndup(c->comp_hdr->TD_keys, + (char *)BLOCK_DATA(td_b) + TD_blk_size, + BLOCK_SIZE(td_b) - TD_blk_size); + k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new); + if (new < 0) { + return NULL; + } else if (new == 0) { + BLOCK_SIZE(td_b) = TD_blk_size; + } else { + kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL; + c->comp_hdr->nTL++; + } + + cr->TL = kh_val(c->comp_hdr->TD_hash, k); + cram_stats_add(c->TL_stats, cr->TL); + + cr->aux = BLOCK_SIZE(s->aux_blk); + cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); + BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); + assert(s->aux_blk->byte <= s->aux_blk->alloc); + + return rg; +} + + +/* + * Handles creation of a new container or new slice, flushing any + * existing containers when appropriate. + * + * Really this is next slice, which may or may not lead to a new container. + * + * Returns cram_container pointer on success + * NULL on failure. + */ +static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { + cram_container *c = fd->ctr; + cram_slice *s; + int i; + + /* First occurence */ + if (c->curr_ref == -2) + c->curr_ref = bam_ref(b); + + if (c->slice) { + s = c->slice; + if (c->multi_seq) { + s->hdr->ref_seq_id = -2; + s->hdr->ref_seq_start = 0; + s->hdr->ref_seq_span = 0; + } else { + s->hdr->ref_seq_id = c->curr_ref; + s->hdr->ref_seq_start = c->first_base; + s->hdr->ref_seq_span = c->last_base - c->first_base + 1; + } + s->hdr->num_records = c->curr_rec; + + if (c->curr_slice == 0) { + if (c->ref_seq_id != s->hdr->ref_seq_id) + c->ref_seq_id = s->hdr->ref_seq_id; + c->ref_seq_start = c->first_base; + } + + c->curr_slice++; + } + + /* Flush container */ + if (c->curr_slice == c->max_slice || + (bam_ref(b) != c->curr_ref && !c->multi_seq)) { + c->ref_seq_span = fd->last_base - c->ref_seq_start + 1; + if (fd->verbose) + fprintf(stderr, "Flush container %d/%d..%d\n", + c->ref_seq_id, c->ref_seq_start, + c->ref_seq_start + c->ref_seq_span -1); + + /* Encode 
slices */ + if (fd->pool) { + if (-1 == cram_flush_container_mt(fd, c)) + return NULL; + } else { + if (-1 == cram_flush_container(fd, c)) + return NULL; + + // Move to sep func, as we need cram_flush_container for + // the closing phase to flush the partial container. + for (i = 0; i < c->max_slice; i++) { + cram_free_slice(c->slices[i]); + c->slices[i] = NULL; + } + + c->slice = NULL; + c->curr_slice = 0; + + /* Easy approach for purposes of freeing stats */ + cram_free_container(c); + } + + c = fd->ctr = cram_new_container(fd->seqs_per_slice, + fd->slices_per_container); + if (!c) + return NULL; + c->record_counter = fd->record_counter; + c->curr_ref = bam_ref(b); + } + + c->last_pos = c->first_base = c->last_base = bam_pos(b)+1; + + /* New slice */ + c->slice = c->slices[c->curr_slice] = + cram_new_slice(MAPPED_SLICE, c->max_rec); + if (!c->slice) + return NULL; + + if (c->multi_seq) { + c->slice->hdr->ref_seq_id = -2; + c->slice->hdr->ref_seq_start = 0; + c->slice->last_apos = 1; + } else { + c->slice->hdr->ref_seq_id = bam_ref(b); + // wrong for unsorted data, will fix during encoding. + c->slice->hdr->ref_seq_start = bam_pos(b)+1; + c->slice->last_apos = bam_pos(b)+1; + } + + c->curr_rec = 0; + + return c; +} + +/* + * Converts a single bam record into a cram record. + * Possibly used within a thread. + * + * Returns 0 on success; + * -1 on failure + */ +static int process_one_read(cram_fd *fd, cram_container *c, + cram_slice *s, cram_record *cr, + bam_seq_t *b, int rnum) { + int i, fake_qual = 0; + char *cp, *rg; + char *ref, *seq, *qual; + + // FIXME: multi-ref containers + + ref = c->ref; + + //fprintf(stderr, "%s => %d\n", rg ? 
rg : "\"\"", cr->rg); + + // Fields to resolve later + //cr->mate_line; // index to another cram_record + //cr->mate_flags; // MF + //cr->ntags; // TC + cr->ntags = 0; //cram_stats_add(c->TC_stats, cr->ntags); + if (fd->version == CRAM_1_VERS) + rg = cram_encode_aux_1_0(fd, b, c, s, cr); + else + rg = cram_encode_aux(fd, b, c, s, cr); + + //cr->aux_size = b->blk_size - ((char *)bam_aux(b) - (char *)&bam_ref(b)); + //cr->aux = DSTRING_LEN(s->aux_ds); + //dstring_nappend(s->aux_ds, bam_aux(b), cr->aux_size); + + /* Read group, identified earlier */ + if (rg) { + SAM_RG *brg = sam_hdr_find_rg(fd->header, rg); + cr->rg = brg ? brg->id : -1; + } else if (fd->version == CRAM_1_VERS) { + SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN"); + assert(brg); + } else { + cr->rg = -1; + } + cram_stats_add(c->RG_stats, cr->rg); + + + cr->ref_id = bam_ref(b); cram_stats_add(c->RI_stats, cr->ref_id); + cr->flags = bam_flag(b); + if (bam_cigar_len(b) == 0) + cr->flags |= BAM_FUNMAP; + cram_stats_add(c->BF_stats, fd->cram_flag_swap[cr->flags & 0xfff]); + + if (!fd->no_ref) + cr->cram_flags = CRAM_FLAG_PRESERVE_QUAL_SCORES; + else + cr->cram_flags = 0; + //cram_stats_add(c->CF_stats, cr->cram_flags); + + cr->len = bam_seq_len(b); cram_stats_add(c->RL_stats, cr->len); + c->num_bases += cr->len; + cr->apos = bam_pos(b)+1; + if (c->pos_sorted) { + if (cr->apos < s->last_apos) { + c->pos_sorted = 0; + } else { + cram_stats_add(c->AP_stats, cr->apos - s->last_apos); + s->last_apos = cr->apos; + } + } else { + //cram_stats_add(c->AP_stats, cr->apos); + } + c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos); + + cr->name = BLOCK_SIZE(s->name_blk); + cr->name_len = bam_name_len(b); + cram_stats_add(c->RN_stats, cr->name_len); + + BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b)); + + + /* + * This seqs_ds is largely pointless and it could reuse the same memory + * over and over. + * s->base_ds is what we need for encoding. 
+ */ + cr->seq = BLOCK_SIZE(s->seqs_blk); + cr->qual = BLOCK_SIZE(s->qual_blk); + BLOCK_GROW(s->seqs_blk, cr->len+1); + BLOCK_GROW(s->qual_blk, cr->len); + seq = cp = (char *)BLOCK_END(s->seqs_blk); + + *seq = 0; + for (i = 0; i < cr->len; i++) { + // FIXME: do 2 char at a time for efficiency +#ifdef SAMTOOLS + cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; +#else + cp[i] = bam_nt16_rev_table[bam_seqi(bam_seq(b), i)]; +#endif + } + BLOCK_SIZE(s->seqs_blk) += cr->len; + + qual = cp = (char *)bam_qual(b); + + /* Copy and parse */ + if (!(cr->flags & BAM_FUNMAP)) { + int32_t *cig_to, *cig_from; + int apos = cr->apos-1, spos = 0; + + cr->cigar = s->ncigar; + cr->ncigar = bam_cigar_len(b); + while (cr->cigar + cr->ncigar >= s->cigar_alloc) { + s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024; + s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar)); + if (!s->cigar) + return -1; + } + + cig_to = (int32_t *)s->cigar; + cig_from = (int32_t *)bam_cigar(b); + + cr->feature = 0; + cr->nfeature = 0; + for (i = 0; i < cr->ncigar; i++) { + enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK; + int cig_len = cig_from[i] >> BAM_CIGAR_SHIFT; + cig_to[i] = cig_from[i]; + + /* Can also generate events from here for CRAM diffs */ + + switch (cig_op) { + int l; + + // Don't trust = and X ops to be correct. + case BAM_CMATCH: + case BAM_CBASE_MATCH: + case BAM_CBASE_MISMATCH: + //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n", + // cig_len, &ref[apos], cig_len, &seq[spos]); + l = 0; + if (!fd->no_ref && cr->len) { + int end = cig_len+apos < c->ref_end + ? 
cig_len : c->ref_end - apos; + for (l = 0; l < end && seq[spos]; l++, apos++, spos++) { + if (ref[apos] != seq[spos]) { + //fprintf(stderr, "Subst: %d; %c vs %c\n", + // spos, ref[apos], seq[spos]); + if (cram_add_substitution(fd, c, s, cr, spos, + seq[spos], qual[spos], + ref[apos])) + return -1; + } + } + } + + if (l < cig_len && cr->len) { + /* off end of sequence or non-ref based output */ + for (; l < cig_len && seq[spos]; l++, spos++) { + if (cram_add_base(fd, c, s, cr, spos, + seq[spos], qual[spos])) + return -1; + } + apos += cig_len; + } else if (!cr->len) { + /* Seq "*" */ + apos += cig_len; + spos += cig_len; + } + break; + + case BAM_CDEL: + if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos])) + return -1; + apos += cig_len; + break; + + case BAM_CREF_SKIP: + if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos])) + return -1; + apos += cig_len; + break; + + case BAM_CINS: + if (cram_add_insertion(c, s, cr, spos, cig_len, + cr->len ? &seq[spos] : NULL)) + return -1; + if (fd->no_ref && cr->len) { + for (l = 0; l < cig_len; l++, spos++) { + cram_add_quality(fd, c, s, cr, spos, qual[spos]); + } + } else { + spos += cig_len; + } + break; + + case BAM_CSOFT_CLIP: + if (cram_add_softclip(c, s, cr, spos, cig_len, + cr->len ? 
&seq[spos] : NULL, + fd->version)) + return -1; + if (fd->no_ref) { + if (cr->len) { + for (l = 0; l < cig_len; l++, spos++) { + cram_add_quality(fd, c, s, cr, spos, qual[spos]); + } + } else { + for (l = 0; l < cig_len; l++, spos++) { + cram_add_quality(fd, c, s, cr, spos, -1); + } + } + } else { + spos += cig_len; + } + break; + + case BAM_CHARD_CLIP: + if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos])) + return -1; + break; + + case BAM_CPAD: + if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos])) + return -1; + break; + } + } + fake_qual = spos; + cr->aend = MIN(apos, c->ref_end); + cram_stats_add(c->FN_stats, cr->nfeature); + } else { + // Unmapped + cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES; + cr->cigar = 0; + cr->ncigar = 0; + cr->nfeature = 0; + cr->aend = cr->apos; +#ifdef BA_external + s->BA_len += cr->len; +#else + for (i = 0; i < cr->len; i++) + cram_stats_add(c->BA_stats, seq[i]); +#endif + } + + /* + * Append to the qual block now. We do this here as + * cram_add_substitution() can generate BA/QS events which need to + * be in the qual block before we append the rest of the data. 
+ */ + if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) { + /* Special case of seq "*" */ + if (cr->len == 0) { + cram_stats_add(c->RL_stats, cr->len = fake_qual); + BLOCK_GROW(s->qual_blk, cr->len); + cp = (char *)BLOCK_END(s->qual_blk); + memset(cp, 255, cr->len); + } else { + BLOCK_GROW(s->qual_blk, cr->len); + cp = (char *)BLOCK_END(s->qual_blk); + char *from = (char *)&bam_qual(b)[0]; + char *to = &cp[0]; + memcpy(to, from, cr->len); + //for (i = 0; i < cr->len; i++) cp[i] = from[i]; + } + BLOCK_SIZE(s->qual_blk) += cr->len; + } else { + if (cr->len == 0) { + cram_stats_add(c->RL_stats, cr->len = cr->aend - cr->apos + 1); + } + } + + /* Now we know apos and aend both, update mate-pair information */ + { + int new; + khint_t k; + + //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum, + // cr->name_len, DSTRING_STR(s->name_ds)+cr->name); + if (cr->flags & BAM_FPAIRED) { + char *key = string_ndup(s->pair_keys, + (char *)BLOCK_DATA(s->name_blk)+cr->name, + cr->name_len); + if (!key) + return -1; + + k = kh_put(m_s2i, s->pair, key, &new); + if (-1 == new) + return -1; + else if (new > 0) + kh_val(s->pair, k) = rnum; + } else { + new = 1; + } + + if (new == 0) { + cram_record *p = &s->crecs[kh_val(s->pair, k)]; + + //fprintf(stderr, "paired %"PRId64"\n", kh_val(s->pair, k)); + + // copy from p to cr + cr->mate_pos = p->apos; + cram_stats_add(c->NP_stats, cr->mate_pos); + + cr->tlen = cr->aend - p->apos; + cram_stats_add(c->TS_stats, cr->tlen); + + cr->mate_flags = + ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + + ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE; + cram_stats_add(c->MF_stats, cr->mate_flags); + + // copy from cr to p + cram_stats_del(c->NP_stats, p->mate_pos); + p->mate_pos = cr->apos; + cram_stats_add(c->NP_stats, p->mate_pos); + + cram_stats_del(c->MF_stats, p->mate_flags); + p->mate_flags = + ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + + ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE; 
+ cram_stats_add(c->MF_stats, p->mate_flags); + + cram_stats_del(c->TS_stats, p->tlen); + p->tlen = p->apos - cr->aend; + cram_stats_add(c->TS_stats, p->tlen); + + // Clear detached from cr flags + //cram_stats_del(c->CF_stats, cr->cram_flags); + cr->cram_flags &= ~CRAM_FLAG_DETACHED; + cram_stats_add(c->CF_stats, cr->cram_flags); + + // Clear detached from p flags and set downstream + cram_stats_del(c->CF_stats, p->cram_flags); + p->cram_flags &= ~CRAM_FLAG_DETACHED; + p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM; + cram_stats_add(c->CF_stats, p->cram_flags); + + p->mate_line = rnum - (kh_val(s->pair, k) + 1); + cram_stats_add(c->NF_stats, p->mate_line); + + kh_val(s->pair, k) = rnum; + } else { + //fprintf(stderr, "unpaired\n"); + + /* Derive mate flags from this flag */ + cr->mate_flags = 0; + if (bam_flag(b) & BAM_FMUNMAP) + cr->mate_flags |= CRAM_M_UNMAP; + if (bam_flag(b) & BAM_FMREVERSE) + cr->mate_flags |= CRAM_M_REVERSE; + + cram_stats_add(c->MF_stats, cr->mate_flags); + + cr->mate_pos = MAX(bam_mate_pos(b)+1, 0); + cram_stats_add(c->NP_stats, cr->mate_pos); + + cr->tlen = bam_ins_size(b); + cram_stats_add(c->TS_stats, cr->tlen); + + cr->cram_flags |= CRAM_FLAG_DETACHED; + cram_stats_add(c->CF_stats, cr->cram_flags); + } + } + + cr->mqual = bam_map_qual(b); + cram_stats_add(c->MQ_stats, cr->mqual); + + cr->mate_ref_id = bam_mate_ref(b); + cram_stats_add(c->NS_stats, cr->mate_ref_id); + + if (!(bam_flag(b) & BAM_FUNMAP)) { + if (c->first_base > cr->apos) + c->first_base = cr->apos; + + if (c->last_base < cr->aend) + c->last_base = cr->aend; + } + + return 0; +} + +/* + * Write iterator: put BAM format sequences into a CRAM file. + * We buffer up a containers worth of data at a time. 
+ * + * Returns 0 on success + * -1 on failure + */ +int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { + cram_container *c; + + if (!fd->ctr) { + fd->ctr = cram_new_container(fd->seqs_per_slice, + fd->slices_per_container); + if (!fd->ctr) + return -1; + fd->ctr->record_counter = fd->record_counter; + } + c = fd->ctr; + + if (!c->slice || c->curr_rec == c->max_rec || + (bam_ref(b) != c->curr_ref && c->curr_ref >= -1)) { + int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1; + int curr_ref = c->slice ? c->curr_ref : bam_ref(b); + + + /* + * Start packing slices when we routinely have under 1/4tr full. + * + * This option isn't available if we choose to embed references + * since we can only have one per slice. + */ + if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 && + fd->last_slice && fd->last_slice < c->max_rec/4+10 && + !fd->embed_ref) { + if (fd->verbose && !c->multi_seq) + fprintf(stderr, "Multi-ref enabled for this container\n"); + multi_seq = 1; + } + + slice_rec = c->slice_rec; + curr_rec = c->curr_rec; + + if (fd->version == CRAM_1_VERS || + c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice) + if (NULL == (c = cram_next_container(fd, b))) + return -1; + + /* + * Due to our processing order, some things we've already done we + * cannot easily undo. So when we first notice we should be packing + * multiple sequences per container we emit the small partial + * container as-is and then start a fresh one in a different mode. + */ + if (multi_seq) { + fd->multi_seq = 1; + c->multi_seq = 1; + c->pos_sorted = 0; // required atm for multi_seq slices + + if (!c->refs_used) { + pthread_mutex_lock(&fd->ref_lock); + c->refs_used = calloc(fd->refs->nref, sizeof(int)); + pthread_mutex_unlock(&fd->ref_lock); + if (!c->refs_used) + return -1; + } + } + + fd->last_slice = curr_rec - slice_rec; + c->slice_rec = c->curr_rec; + + // Have we seen this reference before? 
+ if (bam_ref(b) >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref && + !fd->unsorted) { + + if (!c->refs_used) { + pthread_mutex_lock(&fd->ref_lock); + c->refs_used = calloc(fd->refs->nref, sizeof(int)); + pthread_mutex_unlock(&fd->ref_lock); + if (!c->refs_used) + return -1; + } else if (c->refs_used && c->refs_used[bam_ref(b)]) { + fprintf(stderr, "Unsorted mode enabled\n"); + pthread_mutex_lock(&fd->ref_lock); + fd->unsorted = 1; + pthread_mutex_unlock(&fd->ref_lock); + fd->multi_seq = 1; + } + } + + c->curr_ref = bam_ref(b); + if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++; + } + + if (!c->bams) { + /* First time through, allocate a set of bam pointers */ + pthread_mutex_lock(&fd->bam_list_lock); + if (fd->bl) { + spare_bams *spare = fd->bl; + c->bams = spare->bams; + fd->bl = spare->next; + free(spare); + } else { + c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *)); + if (!c->bams) + return -1; + } + pthread_mutex_unlock(&fd->bam_list_lock); + } + + /* Copy or alloc+copy the bam record, for later encoding */ + if (c->bams[c->curr_c_rec]) + bam_copy(&c->bams[c->curr_c_rec], b); + else + c->bams[c->curr_c_rec] = bam_dup(b); + + c->curr_rec++; + c->curr_c_rec++; + fd->record_counter++; + + return 0; +} diff --git a/star-sys/STAR/source/htslib/cram/cram_encode.h b/star-sys/STAR/source/htslib/cram/cram_encode.h new file mode 100644 index 0000000..9131d6e --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_encode.h @@ -0,0 +1,105 @@ +/* +Copyright (c) 2012-2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/*! \file + * Include cram.h instead. + * + * This is an internal part of the CRAM system and is automatically included + * when you #include cram.h. + * + * Implements the encoding portion of CRAM I/O. Also see + * cram_codecs.[ch] for the actual encoding functions themselves. + */ + +#ifndef _CRAM_WRITE_H_ +#define _CRAM_WRITE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* ---------------------------------------------------------------------- + * CRAM sequence iterators. + */ + +/*! Write iterator: put BAM format sequences into a CRAM file. + * + * We buffer up a containers worth of data at a time. + * + * FIXME: break this into smaller pieces. 
+ * + * @return + * Returns 0 on success; + * -1 on failure + */ +int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b); + + +/* ---------------------------------------------------------------------- + * Internal functions + */ + +/*! INTERNAL: + * Encodes a compression header block into a generic cram_block structure. + * + * @return + * Returns cram_block ptr on success; + * NULL on failure + */ +cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, + cram_block_compression_hdr *h); + +/*! INTERNAL: + * Encodes a slice compression header. + * + * @return + * Returns cram_block on success; + * NULL on failure + */ +cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s); + +/*! INTERNAL: + * Encodes all slices in a container into blocks. + * + * @return + * Returns 0 on success; + * -1 on failure + * + * FIXME: separate into encode_container and write_container. Ideally + * we should be able to do read_container / write_container or + * decode_container / encode_container. + */ +int cram_encode_container(cram_fd *fd, cram_container *c); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/star-sys/STAR/source/htslib/cram/cram_index.c b/star-sys/STAR/source/htslib/cram/cram_index.c new file mode 100644 index 0000000..d78b3e0 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_index.c @@ -0,0 +1,503 @@ +/* +Copyright (c) 2013-2014 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. 
Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * The index is a gzipped tab-delimited text file with one line per slice + * (or one line per reference for slices holding several references). + * The columns are: + * 1: reference number (0 to N-1, as per BAM ref_id; -1 for unmapped) + * 2: reference position of 1st read in slice (1..?) + * 3: span of the slice on the reference (end = start + span - 1) + * 4: offset of container start, as reported by htell() on the CRAM + * file when the index was built. + * 5: landmark, i.e. the byte offset of the slice within its container. + * 6: size of the slice in bytes. + * + * In memory, we hold this in a nested containment list. Each list element is + * a cram_index struct. Each element in turn can contain its own list of + * cram_index structs. + * + * Any start..end range which is entirely contained within another (and + * earlier as it is sorted) range will be held within it. This ensures that + * the outer list will never have containments and we can safely do a + * binary search to find the first range which overlaps any given coordinate. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "io_lib_config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "htslib/hfile.h" +#include "cram/cram.h" +#include "cram/os.h" +#include "cram/zfio.h" + +#if 0 +static void dump_index_(cram_index *e, int level) { + int i, n; + n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end); + printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset); + for (i = 0; i < e->nslice; i++) { + dump_index_(&e->e[i], level+1); + } +} + +static void dump_index(cram_fd *fd) { + int i; + for (i = 0; i < fd->index_sz; i++) { + dump_index_(&fd->index[i], 0); + } +} +#endif + +/* + * Loads a CRAM .crai index into memory. + * + * Returns 0 for success + * -1 for failure + */ +int cram_index_load(cram_fd *fd, const char *fn) { + char fn2[PATH_MAX]; + char buf[65536]; + ssize_t len; + kstring_t kstr = {0}; + hFILE *fp; + cram_index *idx; + cram_index **idx_stack = NULL, *ep, e; + int idx_stack_alloc = 0, idx_stack_ptr = 0; + size_t pos = 0; + + /* Check if already loaded */ + if (fd->index) + return 0; + + fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index)); + if (!fd->index) + return -1; + + idx = &fd->index[0]; + idx->refid = -1; + idx->start = INT_MIN; + idx->end = INT_MAX; + + idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack)); + idx_stack[idx_stack_ptr] = idx; + + sprintf(fn2, "%s.crai", fn); + if (!(fp = hopen(fn2, "r"))) { + perror(fn2); + free(idx_stack); + return -1; + } + + // Load the file into memory + while ((len = hread(fp, buf, 65536)) > 0) + kputsn(buf, len, &kstr); + if (len < 0 || kstr.l < 2) { + if (kstr.s) + free(kstr.s); + free(idx_stack); + return -1; + } + + if (hclose(fp)) { + if (kstr.s) + free(kstr.s); + free(idx_stack); + return -1; + } + + + // Uncompress if required + if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) { + size_t l; + char *s = zlib_mem_inflate(kstr.s, kstr.l, &l); + free(kstr.s); + if (!s) { + 
free(idx_stack); + return -1; + } + kstr.s = s; + kstr.l = l; + } + + + // Parse it line at a time + do { + int nchars; + char *line = &kstr.s[pos]; + + /* 1.1 layout */ + if (sscanf(line, "%d\t%d\t%d\t%"PRId64"\t%d\t%d%n", + &e.refid, + &e.start, + &e.end, + &e.offset, + &e.slice, + &e.len, + &nchars) != 6) { + free(kstr.s); + free(idx_stack); + return -1; + } + + e.end += e.start-1; + //printf("%d/%d..%d\n", e.refid, e.start, e.end); + + if (e.refid < -1) { + free(kstr.s); + free(idx_stack); + fprintf(stderr, "Malformed index file, refid %d\n", e.refid); + return -1; + } + + if (e.refid != idx->refid) { + if (fd->index_sz < e.refid+2) { + size_t index_end = fd->index_sz * sizeof(*fd->index); + fd->index_sz = e.refid+2; + fd->index = realloc(fd->index, + fd->index_sz * sizeof(*fd->index)); + memset(((char *)fd->index) + index_end, 0, + fd->index_sz * sizeof(*fd->index) - index_end); + } + idx = &fd->index[e.refid+1]; + idx->refid = e.refid; + idx->start = INT_MIN; + idx->end = INT_MAX; + idx->nslice = idx->nalloc = 0; + idx->e = NULL; + idx_stack[(idx_stack_ptr = 0)] = idx; + } + + while (!(e.start >= idx->start && e.end <= idx->end)) { + idx = idx_stack[--idx_stack_ptr]; + } + + // Now contains, so append + if (idx->nslice+1 >= idx->nalloc) { + idx->nalloc = idx->nalloc ? 
idx->nalloc*2 : 16; + idx->e = realloc(idx->e, idx->nalloc * sizeof(*idx->e)); + } + + e.nalloc = e.nslice = 0; e.e = NULL; + *(ep = &idx->e[idx->nslice++]) = e; + idx = ep; + + if (++idx_stack_ptr >= idx_stack_alloc) { + idx_stack_alloc *= 2; + idx_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack)); + } + idx_stack[idx_stack_ptr] = idx; + + pos += nchars; + while (pos < kstr.l && kstr.s[pos] != '\n') + pos++; + pos++; + } while (pos < kstr.l); + + free(idx_stack); + free(kstr.s); + + // dump_index(fd); + + return 0; +} + +static void cram_index_free_recurse(cram_index *e) { + if (e->e) { + int i; + for (i = 0; i < e->nslice; i++) { + cram_index_free_recurse(&e->e[i]); + } + free(e->e); + } +} + +void cram_index_free(cram_fd *fd) { + int i; + + if (!fd->index) + return; + + for (i = 0; i < fd->index_sz; i++) { + cram_index_free_recurse(&fd->index[i]); + } + free(fd->index); + + fd->index = NULL; +} + +/* + * Searches the index for the first slice overlapping a reference ID + * and position, or one immediately preceeding it if none is found in + * the index to overlap this position. (Our index may have missing + * entries, but we require at least one per reference.) + * + * If the index finds multiple slices overlapping this position we + * return the first one only. Subsequent calls should specifying + * "from" as the last slice we checked to find the next one. Otherwise + * set "from" to be NULL to find the first one. 
+ * + * Returns the cram_index pointer on sucess + * NULL on failure + */ +cram_index *cram_index_query(cram_fd *fd, int refid, int pos, + cram_index *from) { + int i, j, k; + cram_index *e; + + if (refid+1 < 0 || refid+1 >= fd->index_sz) + return NULL; + + i = 0, j = fd->index[refid+1].nslice-1; + + if (!from) + from = &fd->index[refid+1]; + + for (k = j/2; k != i; k = (j-i)/2 + i) { + if (from->e[k].refid > refid) { + j = k; + continue; + } + + if (from->e[k].refid < refid) { + i = k; + continue; + } + + if (from->e[k].start >= pos) { + j = k; + continue; + } + + if (from->e[k].start < pos) { + i = k; + continue; + } + } + + /* The above found *a* bin overlapping, but not necessarily the first */ + while (i > 0 && from->e[i-1].end >= pos) + i--; + + /* Special case for matching a start pos */ + if (i+1 < from->nslice && + from->e[i+1].start == pos && + from->e[i+1].refid == refid) + i++; + + e = &from->e[i]; + + return e; +} + + +/* + * Skips to a container overlapping the start coordinate listed in + * cram_range. + * + * In theory we call cram_index_query multiple times, once per slice + * overlapping the range. However slices may be absent from the index + * which makes this problematic. Instead we find the left-most slice + * and then read from then on, skipping decoding of slices and/or + * whole containers when they don't overlap the specified cram_range. + * + * Returns 0 on success + * -1 on failure + */ +int cram_seek_to_refpos(cram_fd *fd, cram_range *r) { + cram_index *e; + + // Ideally use an index, so see if we have one. + if ((e = cram_index_query(fd, r->refid, r->start, NULL))) { + if (0 != cram_seek(fd, e->offset, SEEK_SET)) + if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR)) + return -1; + } else { + fprintf(stderr, "Unknown reference ID. 
Missing from index?\n"); + return -1; + } + + if (fd->ctr) { + cram_free_container(fd->ctr); + fd->ctr = NULL; + } + + return 0; +} + + +/* + * A specialised form of cram_index_build (below) that deals with slices + * having multiple references in this (ref_id -2). In this scenario we + * decode the slice to look at the RI data series instead. + * + * Returns 0 on success + * -1 on failure + */ +static int cram_index_build_multiref(cram_fd *fd, + cram_container *c, + cram_slice *s, + zfp *fp, + off_t cpos, + int32_t landmark, + int sz) { + int i, ref = -2, ref_start = 0, ref_end; + char buf[1024]; + + if (0 != cram_decode_slice(fd, c, s, fd->header)) + return -1; + + ref_end = INT_MIN; + for (i = 0; i < s->hdr->num_records; i++) { + if (s->crecs[i].ref_id == ref) { + if (ref_end < s->crecs[i].aend) + ref_end = s->crecs[i].aend; + continue; + } + + if (ref != -2) { + sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", + ref, ref_start, ref_end - ref_start + 1, + (int64_t)cpos, landmark, sz); + zfputs(buf, fp); + } + + ref = s->crecs[i].ref_id; + ref_start = s->crecs[i].apos; + ref_end = INT_MIN; + } + + if (ref != -2) { + sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", + ref, ref_start, ref_end - ref_start + 1, + (int64_t)cpos, landmark, sz); + zfputs(buf, fp); + } + + return 0; +} + +/* + * Builds an index file. + * + * fd is a newly opened cram file that we wish to index. + * fn_base is the filename of the associated CRAM file. Internally we + * add ".crai" to this to get the index filename. 
+ * + * Returns 0 on success + * -1 on failure + */ +int cram_index_build(cram_fd *fd, const char *fn_base) { + cram_container *c; + off_t cpos, spos, hpos; + zfp *fp; + char fn_idx[PATH_MAX]; + + if (strlen(fn_base) > PATH_MAX-6) + return -1; + + sprintf(fn_idx, "%s.crai", fn_base); + if (!(fp = zfopen(fn_idx, "wz"))) { + perror(fn_idx); + return -1; + } + + cpos = htell(fd->fp); + while ((c = cram_read_container(fd))) { + int j; + + if (fd->err) { + perror("Cram container read"); + return 1; + } + + hpos = htell(fd->fp); + + if (!(c->comp_hdr_block = cram_read_block(fd))) + return 1; + assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER); + + c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block); + if (!c->comp_hdr) + return -1; + + // 2.0 format + for (j = 0; j < c->num_landmarks; j++) { + char buf[1024]; + cram_slice *s; + int sz; + + spos = htell(fd->fp); + assert(spos - cpos - c->offset == c->landmark[j]); + + if (!(s = cram_read_slice(fd))) { + zfclose(fp); + return -1; + } + + sz = (int)(htell(fd->fp) - spos); + + if (s->hdr->ref_seq_id == -2) { + cram_index_build_multiref(fd, c, s, fp, + cpos, c->landmark[j], sz); + } else { + sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", + s->hdr->ref_seq_id, s->hdr->ref_seq_start, + s->hdr->ref_seq_span, (int64_t)cpos, + c->landmark[j], sz); + zfputs(buf, fp); + } + + cram_free_slice(s); + } + + cpos = htell(fd->fp); + assert(cpos == hpos + c->length); + + cram_free_container(c); + } + if (fd->err) { + zfclose(fp); + return -1; + } + + + return zfclose(fp); +} diff --git a/star-sys/STAR/source/htslib/cram/cram_index.h b/star-sys/STAR/source/htslib/cram/cram_index.h new file mode 100644 index 0000000..5e3b8f5 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_index.h @@ -0,0 +1,98 @@ +/* +Copyright (c) 2013 Genome Research Ltd. 
+Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _CRAM_INDEX_H_ +#define _CRAM_INDEX_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Loads a CRAM .crai index into memory. + * Returns 0 for success + * -1 for failure + */ +int cram_index_load(cram_fd *fd, const char *fn); + +void cram_index_free(cram_fd *fd); + +/* + * Searches the index for the first slice overlapping a reference ID + * and position. 
+ * + * Returns the cram_index pointer on sucess + * NULL on failure + */ +cram_index *cram_index_query(cram_fd *fd, int refid, int pos, cram_index *frm); + +/* + * Skips to a container overlapping the start coordinate listed in + * cram_range. + * + * Returns 0 on success + * -1 on failure + */ +int cram_seek_to_refpos(cram_fd *fd, cram_range *r); + +void cram_index_free(cram_fd *fd); + +/* + * Skips to a container overlapping the start coordinate listed in + * cram_range. + * + * In theory we call cram_index_query multiple times, once per slice + * overlapping the range. However slices may be absent from the index + * which makes this problematic. Instead we find the left-most slice + * and then read from then on, skipping decoding of slices and/or + * whole containers when they don't overlap the specified cram_range. + * + * Returns 0 on success + * -1 on failure + */ +int cram_seek_to_refpos(cram_fd *fd, cram_range *r); + +/* + * Builds an index file. + * + * fd is a newly opened cram file that we wish to index. + * fn_base is the filename of the associated CRAM file. Internally we + * add ".crai" to this to get the index filename. + * + * Returns 0 on success + * -1 on failure + */ +int cram_index_build(cram_fd *fd, const char *fn_base); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/star-sys/STAR/source/htslib/cram/cram_io.c b/star-sys/STAR/source/htslib/cram/cram_io.c new file mode 100644 index 0000000..8abc59d --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_io.c @@ -0,0 +1,3652 @@ +/* +Copyright (c) 2012-2014 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * CRAM I/O primitives. + * + * - ITF8 encoding and decoding. + * - Block based I/O + * - Zlib inflating and deflating (memory) + * - CRAM basic data structure reading and writing + * - File opening / closing + * - Reference sequence handling + */ + +/* + * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need + * a way to return errors for when malloc fails. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "io_lib_config.h" +#endif + +#include +#include +#include +#include +#include +#include +#ifdef HAVE_LIBBZ2 +#include +#endif +#include +#include +#include +#include + +#include "cram/cram.h" +#include "cram/os.h" +#include "cram/md5.h" +#include "cram/open_trace_file.h" + +//#define REF_DEBUG + +#ifdef REF_DEBUG +#include +#define gettid() (int)syscall(SYS_gettid) + +#define RP(...) fprintf (stderr, __VA_ARGS__) +#else +#define RP(...) +#endif + +#ifdef SAMTOOLS +#include "htslib/hfile.h" +#define paranoid_hclose(fp) (hclose(fp)) +#else +#define hclose_abruptly(fp) (fclose(fp)) +#define hflush(fp) (fflush(fp)) +#define hgetc(fp) (getc(fp)) +#define hputc(c, fp) (putc((c), (fp))) +#define hread(fp, buffer, nbytes) (fread((buffer), 1, (nbytes), (fp))) +#define hseek(fp, offset, whence) (fseeko((fp), (offset), (whence))) +#define hwrite(fp, buffer, nbytes) (fwrite((buffer), 1, (nbytes), (fp))) +#define paranoid_hclose(fp) (paranoid_fclose(fp)) +#endif + +/* ---------------------------------------------------------------------- + * ITF8 encoding and decoding. + * +* Also see the itf8_get and itf8_put macros in cram_io.h + */ + +/* + * Reads an integer in ITF-8 encoding from 'cp' and stores it in + * *val. 
+ * + * Returns the number of bytes read on success + * -1 on failure + */ +int itf8_decode(cram_fd *fd, int32_t *val_p) { + static int nbytes[16] = { + 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx + 1,1,1,1, // 1000xxxx - 1011xxxx + 2,2, // 1100xxxx - 1101xxxx + 3, // 1110xxxx + 4, // 1111xxxx + }; + + static int nbits[16] = { + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx + 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx + 0x1f, 0x1f, // 1100xxxx - 1101xxxx + 0x0f, // 1110xxxx + 0x0f, // 1111xxxx + }; + + int32_t val = hgetc(fd->fp); + if (val == -1) + return -1; + + int i = nbytes[val>>4]; + val &= nbits[val>>4]; + + switch(i) { + case 0: + *val_p = val; + return 1; + + case 1: + val = (val<<8) | (unsigned char)hgetc(fd->fp); + *val_p = val; + return 2; + + case 2: + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + *val_p = val; + return 3; + + case 3: + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + *val_p = val; + return 4; + + case 4: // really 3.5 more, why make it different? + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f); + *val_p = val; + } + + return 5; +} + +/* + * Encodes and writes a single integer in ITF-8 format. + * Returns 0 on success + * -1 on failure + */ +int itf8_encode(cram_fd *fd, int32_t val) { + char buf[5]; + int len = itf8_put(buf, val); + return hwrite(fd->fp, buf, len) == len ? 
0 : -1; +} + +#ifndef ITF8_MACROS +/* + * As above, but decoding from memory + */ +int itf8_get(char *cp, int32_t *val_p) { + unsigned char *up = (unsigned char *)cp; + + if (up[0] < 0x80) { + *val_p = up[0]; + return 1; + } else if (up[0] < 0xc0) { + *val_p = ((up[0] <<8) | up[1]) & 0x3fff; + return 2; + } else if (up[0] < 0xe0) { + *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; + return 3; + } else if (up[0] < 0xf0) { + *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; + return 4; + } else { + *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); + return 5; + } +} + +/* + * Stores a value to memory in ITF-8 format. + * + * Returns the number of bytes required to store the number. + * This is a maximum of 5 bytes. + */ +int itf8_put(char *cp, int32_t val) { + if (!(val & ~0x00000007f)) { // 1 byte + *cp = val; + return 1; + } else if (!(val & ~0x00003fff)) { // 2 byte + *cp++ = (val >> 8 ) | 0x80; + *cp = val & 0xff; + return 2; + } else if (!(val & ~0x01fffff)) { // 3 byte + *cp++ = (val >> 16) | 0xc0; + *cp++ = (val >> 8 ) & 0xff; + *cp = val & 0xff; + return 3; + } else if (!(val & ~0x0fffffff)) { // 4 byte + *cp++ = (val >> 24) | 0xe0; + *cp++ = (val >> 16) & 0xff; + *cp++ = (val >> 8 ) & 0xff; + *cp = val & 0xff; + return 4; + } else { // 5 byte + *cp++ = 0xf0 | ((val>>28) & 0xff); + *cp++ = (val >> 20) & 0xff; + *cp++ = (val >> 12) & 0xff; + *cp++ = (val >> 4 ) & 0xff; + *cp = val & 0x0f; + return 5; + } +} +#endif + +/* 64-bit itf8 variant */ +int ltf8_put(char *cp, int64_t val) { + if (!(val & ~((1LL<<7)-1))) { + *cp = val; + return 1; + } else if (!(val & ~((1LL<<(6+8))-1))) { + *cp++ = (val >> 8 ) | 0x80; + *cp = val & 0xff; + return 2; + } else if (!(val & ~((1LL<<(5+2*8))-1))) { + *cp++ = (val >> 16) | 0xc0; + *cp++ = (val >> 8 ) & 0xff; + *cp = val & 0xff; + return 3; + } else if (!(val & ~((1LL<<(4+3*8))-1))) { + *cp++ = (val >> 24) | 0xe0; + *cp++ = (val >> 16) & 0xff; + *cp++ = 
(val >> 8 ) & 0xff; + *cp = val & 0xff; + return 4; + } else if (!(val & ~((1LL<<(3+4*8))-1))) { + *cp++ = (val >> 32) | 0xf0; + *cp++ = (val >> 24) & 0xff; + *cp++ = (val >> 16) & 0xff; + *cp++ = (val >> 8 ) & 0xff; + *cp = val & 0xff; + return 5; + } else if (!(val & ~((1LL<<(2+5*8))-1))) { + *cp++ = (val >> 40) | 0xf8; + *cp++ = (val >> 32) & 0xff; + *cp++ = (val >> 24) & 0xff; + *cp++ = (val >> 16) & 0xff; + *cp++ = (val >> 8 ) & 0xff; + *cp = val & 0xff; + return 6; + } else if (!(val & ~((1LL<<(1+6*8))-1))) { + *cp++ = (val >> 48) | 0xfc; + *cp++ = (val >> 40) & 0xff; + *cp++ = (val >> 32) & 0xff; + *cp++ = (val >> 24) & 0xff; + *cp++ = (val >> 16) & 0xff; + *cp++ = (val >> 8 ) & 0xff; + *cp = val & 0xff; + return 7; + } else if (!(val & ~((1LL<<(7*8))-1))) { + *cp++ = (val >> 56) | 0xfe; + *cp++ = (val >> 48) & 0xff; + *cp++ = (val >> 40) & 0xff; + *cp++ = (val >> 32) & 0xff; + *cp++ = (val >> 24) & 0xff; + *cp++ = (val >> 16) & 0xff; + *cp++ = (val >> 8 ) & 0xff; + *cp = val & 0xff; + return 8; + } else { + *cp++ = 0xff; + *cp++ = (val >> 56) & 0xff; + *cp++ = (val >> 48) & 0xff; + *cp++ = (val >> 40) & 0xff; + *cp++ = (val >> 32) & 0xff; + *cp++ = (val >> 24) & 0xff; + *cp++ = (val >> 16) & 0xff; + *cp++ = (val >> 8 ) & 0xff; + *cp = val & 0xff; + return 9; + } +} + +int ltf8_get(char *cp, int64_t *val_p) { + unsigned char *up = (unsigned char *)cp; + + if (up[0] < 0x80) { + *val_p = up[0]; + return 1; + } else if (up[0] < 0xc0) { + *val_p = (((uint64_t)up[0]<< 8) | + (uint64_t)up[1]) & (((1LL<<(6+8)))-1); + return 2; + } else if (up[0] < 0xe0) { + *val_p = (((uint64_t)up[0]<<16) | + ((uint64_t)up[1]<< 8) | + (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); + return 3; + } else if (up[0] < 0xf0) { + *val_p = (((uint64_t)up[0]<<24) | + ((uint64_t)up[1]<<16) | + ((uint64_t)up[2]<< 8) | + (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); + return 4; + } else if (up[0] < 0xf8) { + *val_p = (((uint64_t)up[0]<<32) | + ((uint64_t)up[1]<<24) | + ((uint64_t)up[2]<<16) | + 
((uint64_t)up[3]<< 8) | + (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); + return 5; + } else if (up[0] < 0xfc) { + *val_p = (((uint64_t)up[0]<<40) | + ((uint64_t)up[1]<<32) | + ((uint64_t)up[2]<<24) | + ((uint64_t)up[3]<<16) | + ((uint64_t)up[4]<< 8) | + (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); + return 6; + } else if (up[0] < 0xfe) { + *val_p = (((uint64_t)up[0]<<48) | + ((uint64_t)up[1]<<40) | + ((uint64_t)up[2]<<32) | + ((uint64_t)up[3]<<24) | + ((uint64_t)up[4]<<16) | + ((uint64_t)up[5]<< 8) | + (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); + return 7; + } else if (up[0] < 0xff) { + *val_p = (((uint64_t)up[1]<<48) | + ((uint64_t)up[2]<<40) | + ((uint64_t)up[3]<<32) | + ((uint64_t)up[4]<<24) | + ((uint64_t)up[5]<<16) | + ((uint64_t)up[6]<< 8) | + (uint64_t)up[7]) & ((1LL<<(7*8))-1); + return 8; + } else { + *val_p = (((uint64_t)up[1]<<56) | + ((uint64_t)up[2]<<48) | + ((uint64_t)up[3]<<40) | + ((uint64_t)up[4]<<32) | + ((uint64_t)up[5]<<24) | + ((uint64_t)up[6]<<16) | + ((uint64_t)up[7]<< 8) | + (uint64_t)up[8]); + return 9; + } +} + +int ltf8_decode(cram_fd *fd, int64_t *val_p) { + int c = hgetc(fd->fp); + int64_t val = (unsigned char)c; + if (c == -1) + return -1; + + if (val < 0x80) { + *val_p = val; + return 1; + + } else if (val < 0xc0) { + val = (val<<8) | (unsigned char)hgetc(fd->fp); + *val_p = val & (((1LL<<(6+8)))-1); + return 2; + + } else if (val < 0xe0) { + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + *val_p = val & ((1LL<<(5+2*8))-1); + return 3; + + } else if (val < 0xf0) { + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + *val_p = val & ((1LL<<(4+3*8))-1); + return 4; + + } else if (val < 0xf8) { + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + *val_p = val & 
((1LL<<(3+4*8))-1); + return 5; + + } else if (val < 0xfc) { + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + *val_p = val & ((1LL<<(2+5*8))-1); + return 6; + + } else if (val < 0xfe) { + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + *val_p = val & ((1LL<<(1+6*8))-1); + return 7; + + } else if (val < 0xff) { + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + *val_p = val & ((1LL<<(7*8))-1); + return 8; + + } else { + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + val = (val<<8) | (unsigned char)hgetc(fd->fp); + *val_p = val; + } + + return 9; +} + +/* + * Pushes a value in ITF8 format onto the end of a block. + * This shouldn't be used for high-volume data as it is not the fastest + * method. 
+ * + * Returns the number of bytes written + */ +int itf8_put_blk(cram_block *blk, int val) { + char buf[5]; + int sz; + + sz = itf8_put(buf, val); + BLOCK_APPEND(blk, buf, sz); + return sz; +} + +/* + * Decodes a 32-bit little endian value from fd and stores in val. + * + * Returns the number of bytes read on success + * -1 on failure + */ +int int32_decode(cram_fd *fd, int32_t *val) { + int32_t i; + if (4 != hread(fd->fp, &i, 4)) + return -1; + + *val = le_int4(i); + return 4; +} + +/* + * Encodes a 32-bit little endian value 'val' and writes to fd. + * + * Returns the number of bytes written on success + * -1 on failure + */ +int int32_encode(cram_fd *fd, int32_t val) { + val = le_int4(val); + if (4 != hwrite(fd->fp, &val, 4)) + return -1; + + return 4; +} + +/* As int32_decoded/encode, but from/to blocks instead of cram_fd */ +int int32_get(cram_block *b, int32_t *val) { + if (b->uncomp_size - BLOCK_SIZE(b) < 4) + return -1; + + *val = + b->data[b->byte ] | + (b->data[b->byte+1] << 8) | + (b->data[b->byte+2] << 16) | + (b->data[b->byte+3] << 24); + BLOCK_SIZE(b) += 4; + return 4; +} + +/* As int32_decoded/encode, but from/to blocks instead of cram_fd */ +int int32_put(cram_block *b, int32_t val) { + unsigned char cp[4]; + cp[0] = ( val & 0xff); + cp[1] = ((val>>8) & 0xff); + cp[2] = ((val>>16) & 0xff); + cp[3] = ((val>>24) & 0xff); + + BLOCK_APPEND(b, cp, 4); + return b->data ? 0 : -1; +} + +/* ---------------------------------------------------------------------- + * zlib compression code - from Gap5's tg_iface_g.c + * They're static here as they're only used within the cram_compress_block + * and cram_uncompress_block functions, which are the external interface. 
+ */ +char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { + z_stream s; + unsigned char *data = NULL; /* Uncompressed output */ + int data_alloc = 0; + int err; + + /* Starting point at uncompressed size, and scale after that */ + data = malloc(data_alloc = csize*1.2+100); + if (!data) + return NULL; + + /* Initialise zlib stream */ + s.zalloc = Z_NULL; /* use default allocation functions */ + s.zfree = Z_NULL; + s.opaque = Z_NULL; + s.next_in = (unsigned char *)cdata; + s.avail_in = csize; + s.total_in = 0; + s.next_out = data; + s.avail_out = data_alloc; + s.total_out = 0; + + //err = inflateInit(&s); + err = inflateInit2(&s, 15 + 32); + if (err != Z_OK) { + fprintf(stderr, "zlib inflateInit error: %s\n", s.msg); + free(data); + return NULL; + } + + /* Decode to 'data' array */ + for (;s.avail_in;) { + unsigned char *data_tmp; + int alloc_inc; + + s.next_out = &data[s.total_out]; + err = inflate(&s, Z_NO_FLUSH); + if (err == Z_STREAM_END) + break; + + if (err != Z_OK) { + fprintf(stderr, "zlib inflate error: %s\n", s.msg); + break; + } + + /* More to come, so realloc based on growth so far */ + alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100; + data = realloc((data_tmp = data), data_alloc += alloc_inc); + if (!data) { + free(data_tmp); + return NULL; + } + s.avail_out += alloc_inc; + } + inflateEnd(&s); + + *size = s.total_out; + return (char *)data; +} + +static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, + int level, int strat) { + z_stream s; + unsigned char *cdata = NULL; /* Compressed output */ + int cdata_alloc = 0; + int cdata_pos = 0; + int err; + + cdata = malloc(cdata_alloc = size*1.05+100); + if (!cdata) + return NULL; + cdata_pos = 0; + + /* Initialise zlib stream */ + s.zalloc = Z_NULL; /* use default allocation functions */ + s.zfree = Z_NULL; + s.opaque = Z_NULL; + s.next_in = (unsigned char *)data; + s.avail_in = size; + s.total_in = 0; + s.next_out = cdata; + s.avail_out = cdata_alloc; + 
s.total_out = 0; + s.data_type = Z_BINARY; + + err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat); + if (err != Z_OK) { + fprintf(stderr, "zlib deflateInit2 error: %s\n", s.msg); + return NULL; + } + + /* Encode to 'cdata' array */ + for (;s.avail_in;) { + s.next_out = &cdata[cdata_pos]; + s.avail_out = cdata_alloc - cdata_pos; + if (cdata_alloc - cdata_pos <= 0) { + fprintf(stderr, "Deflate produced larger output than expected. Abort\n"); + return NULL; + } + err = deflate(&s, Z_NO_FLUSH); + cdata_pos = cdata_alloc - s.avail_out; + if (err != Z_OK) { + fprintf(stderr, "zlib deflate error: %s\n", s.msg); + break; + } + } + if (deflate(&s, Z_FINISH) != Z_STREAM_END) { + fprintf(stderr, "zlib deflate error: %s\n", s.msg); + } + *cdata_size = s.total_out; + + if (deflateEnd(&s) != Z_OK) { + fprintf(stderr, "zlib deflate error: %s\n", s.msg); + } + return (char *)cdata; +} + +/* ---------------------------------------------------------------------- + * CRAM blocks - the dynamically growable data block. We have code to + * create, update, (un)compress and read/write. + * + * These are derived from the deflate_interlaced.c blocks, but with the + * CRAM extension of content types and IDs. + */ + +/* + * Allocates a new cram_block structure with a specified content_type and + * id. + * + * Returns block pointer on success + * NULL on failure + */ +cram_block *cram_new_block(enum cram_content_type content_type, + int content_id) { + cram_block *b = malloc(sizeof(*b)); + if (!b) + return NULL; + b->method = b->orig_method = RAW; + b->content_type = content_type; + b->content_id = content_id; + b->comp_size = 0; + b->uncomp_size = 0; + b->data = NULL; + b->alloc = 0; + b->byte = 0; + b->bit = 7; // MSB + + return b; +} + +/* + * Reads a block from a cram file. + * Returns cram_block pointer on success. 
+ * NULL on failure + */ +cram_block *cram_read_block(cram_fd *fd) { + cram_block *b = malloc(sizeof(*b)); + if (!b) + return NULL; + + //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp)); + + if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; } + if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; } + if (-1 == itf8_decode(fd, &b->content_id)) { free(b); return NULL; } + if (-1 == itf8_decode(fd, &b->comp_size)) { free(b); return NULL; } + if (-1 == itf8_decode(fd, &b->uncomp_size)) { free(b); return NULL; } + + // fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n", + // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size); + + if (b->method == RAW) { + b->alloc = b->uncomp_size; + if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; } + if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) { + free(b->data); + free(b); + return NULL; + } + } else { + b->alloc = b->comp_size; + if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; } + if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) { + free(b->data); + free(b); + return NULL; + } + } + + b->orig_method = b->method; + b->idx = 0; + b->byte = 0; + b->bit = 7; // MSB + + return b; +} + +/* + * Writes a CRAM block. + * Returns 0 on success + * -1 on failure + */ +int cram_write_block(cram_fd *fd, cram_block *b) { + assert(b->method != RAW || (b->comp_size == b->uncomp_size)); + + if (hputc(b->method, fd->fp) == EOF) return -1; + if (hputc(b->content_type, fd->fp) == EOF) return -1; + if (itf8_encode(fd, b->content_id) == -1) return -1; + if (itf8_encode(fd, b->comp_size) == -1) return -1; + if (itf8_encode(fd, b->uncomp_size) == -1) return -1; + + if (b->method == RAW) { + if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size)) + return -1; + } else { + if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size)) + return -1; + } + + return 0; +} + +/* + * Frees a CRAM block, deallocating internal data too. 
+ */ +void cram_free_block(cram_block *b) { + if (!b) + return; + if (b->data) + free(b->data); + free(b); +} + +/* + * Uncompresses a CRAM block, if compressed. + */ +int cram_uncompress_block(cram_block *b) { + char *uncomp; + size_t uncomp_size = 0; + + if (b->uncomp_size == 0) { + // blank block + b->method = RAW; + return 0; + } + + switch (b->method) { + case RAW: + b->uncomp_size = b->comp_size; + return 0; + + case GZIP: + uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); + if (!uncomp) + return -1; + if ((int)uncomp_size != b->uncomp_size) + return -1; + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = uncomp_size; + b->method = RAW; + break; + +#ifdef HAVE_LIBBZ2 + case BZIP2: { + unsigned int usize = b->uncomp_size; + if (!(uncomp = malloc(usize))) + return -1; + if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize, + (char *)b->data, b->comp_size, + 0, 0)) { + free(uncomp); + return -1; + } + b->data = (unsigned char *)uncomp; + b->alloc = usize; + b->method = RAW; + b->uncomp_size = usize; // Just incase it differs + break; + } +#else + case BZIP2: + fprintf(stderr, "Bzip2 compression is not compiled into this " + "version.\nPlease rebuild and try again.\n"); + return -1; +#endif + + case BM_ERROR: + default: + return -1; + } + + return 0; +} + +#ifdef HAVE_LIBBZ2 +static int cram_compress_block_bzip2(cram_fd *fd, cram_block *b, + cram_metrics *metrics, int level) { + unsigned int comp_size = b->uncomp_size*1.01 + 600; + char *comp = malloc(comp_size); + char *data = (char *)b->data; + + if (!comp) + return -1; + + if (!data) + data = ""; + + if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size, + data, b->uncomp_size, + level, 0, 30)) { + free(comp); + return -1; + } + + free(b->data); + b->data = (unsigned char *)comp; + b->method = BZIP2; + b->comp_size = comp_size; + + if (fd->verbose) + fprintf(stderr, "Compressed block ID %d from %d to %d\n", + b->content_id, b->uncomp_size, b->comp_size); + + return 0; 
+} +#endif + +/* + * Compresses a block using one of two different zlib strategies. If we only + * want one choice set strat2 to be -1. + * + * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED + * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is + * significantly faster. + */ +int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, + int level, int strat, + int level2, int strat2) { + char *comp = NULL; + size_t comp_size = 0; + + if (level == 0) { + b->method = RAW; + b->comp_size = b->uncomp_size; + return 0; + } + + if (b->method != RAW) { + fprintf(stderr, "Attempt to compress an already compressed block.\n"); + return 0; + } + +#ifdef HAVE_LIBBZ2 + if (fd->use_bz2) + // metrics ignored for bzip2 + return cram_compress_block_bzip2(fd, b, metrics, level); +#endif + + pthread_mutex_lock(&fd->metrics_lock); + if (strat2 >= 0) + if (fd->verbose > 1) + fprintf(stderr, "metrics trial %d, next_trial %d, m1 %d, m2 %d\n", + metrics->trial, metrics->next_trial, + metrics->m1, metrics->m2); + + if (strat2 >= 0 && (metrics->trial > 0 || --metrics->next_trial <= 0)) { + char *c1, *c2; + size_t s1, s2; + + if (metrics->next_trial == 0) { + metrics->next_trial = 100; + metrics->trial = 3; + metrics->m1 = metrics->m2 = 0; + } + pthread_mutex_unlock(&fd->metrics_lock); + + c1 = zlib_mem_deflate((char *)b->data, b->uncomp_size, + &s1, level, strat); + c2 = zlib_mem_deflate((char *)b->data, b->uncomp_size, + &s2, level2, strat2); + if (!c1 || !c2) + return -1; + + //fprintf(stderr, "1: %6d 2: %6d %5.1f\n", s1, s2, 100.0*s1/s2); + + pthread_mutex_lock(&fd->metrics_lock); + if (s1 < 0.98 * s2) { // 2nd one should be faster alternative + if (fd->verbose > 1) + fprintf(stderr, "M1 wins %d vs %d\n", (int)s1, (int)s2); + comp = c1; comp_size = s1; + free(c2); + metrics->m1++; + } else { + if (fd->verbose > 1) + fprintf(stderr, "M2 wins %d vs %d\n", (int)s1, (int)s2); + comp = c2; comp_size = s2; + free(c1); + 
metrics->m2++; + } + metrics->trial--; + pthread_mutex_unlock(&fd->metrics_lock); + } else if (strat2 >= 0) { + int xlevel = metrics->m1 > metrics->m2 ? level : level2; + int xstrat = metrics->m1 > metrics->m2 ? strat : strat2; + pthread_mutex_unlock(&fd->metrics_lock); + comp = zlib_mem_deflate((char *)b->data, b->uncomp_size, &comp_size, + xlevel, xstrat); + } else { + pthread_mutex_unlock(&fd->metrics_lock); + comp = zlib_mem_deflate((char *)b->data, b->uncomp_size, &comp_size, + level, strat); + } + + if (!comp) + return -1; + + free(b->data); + b->data = (unsigned char *)comp; + b->method = GZIP; + b->comp_size = comp_size; + + if (fd->verbose) + fprintf(stderr, "Compressed block ID %d from %d to %d\n", + b->content_id, b->uncomp_size, b->comp_size); + + return 0; +} + +cram_metrics *cram_new_metrics(void) { + cram_metrics *m = malloc(sizeof(*m)); + if (!m) + return NULL; + m->m1 = m->m2 = 0; + m->trial = 2; + m->next_trial = 100; + return m; +} + +char *cram_block_method2str(enum cram_block_method m) { + switch(m) { + case RAW: return "RAW"; + case GZIP: return "GZIP"; + case BZIP2: return "BZIP2"; + case BM_ERROR: break; + } + return "?"; +} + +char *cram_content_type2str(enum cram_content_type t) { + switch (t) { + case FILE_HEADER: return "FILE_HEADER"; + case COMPRESSION_HEADER: return "COMPRESSION_HEADER"; + case MAPPED_SLICE: return "MAPPED_SLICE"; + case UNMAPPED_SLICE: return "UNMAPPED_SLICE"; + case EXTERNAL: return "EXTERNAL"; + case CORE: return "CORE"; + case CT_ERROR: break; + } + return "?"; +} + +/* + * Extra error checking on fclose to really ensure data is written. + * Care needs to be taken to handle pipes vs real files. + * + * Returns 0 on success + * -1 on failure. 
+ */ +int paranoid_fclose(FILE *fp) { + if (-1 == fflush(fp) && errno != EBADF) { + fclose(fp); + return -1; + } + + errno = 0; + if (-1 == fsync(fileno(fp))) { + if (errno != EINVAL) { // eg pipe + fclose(fp); + return -1; + } + } + return fclose(fp); +} + +/* ---------------------------------------------------------------------- + * Reference sequence handling + * + * These revolve around the refs_t structure, which may potentially be + * shared between multiple cram_fd. + * + * We start with refs_create() to allocate an empty refs_t and then + * populate it with @SQ line data using refs_from_header(). This is done on + * cram_open(). Also at start up we can call cram_load_reference() which + * is used with "scramble -r foo.fa". This replaces the fd->refs with the + * new one specified. In either case refs2id() is then called which + * maps ref_entry names to @SQ ids (refs_t->ref_id[]). + * + * Later, possibly within a thread, we will want to know the actual ref + * seq itself, obtained by calling cram_get_ref(). This may use the + * UR: or M5: fields or the filename specified in the original + * cram_load_reference() call. + * + * Given the potential for multi-threaded reference usage, we have + * reference counting (sorry for the confusing double use of "ref") to + * track the number of callers interested in any specific reference. 
+ */ + +void refs_free(refs_t *r) { + RP("refs_free()\n"); + + if (--r->count > 0) + return; + + if (!r) + return; + + if (r->pool) + string_pool_destroy(r->pool); + + if (r->h_meta) { + khint_t k; + + for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) { + ref_entry *e; + + if (!kh_exist(r->h_meta, k)) + continue; + if (!(e = kh_val(r->h_meta, k))) + continue; + if (e->seq) + free(e->seq); + free(e); + } + + kh_destroy(refs, r->h_meta); + } + + if (r->ref_id) + free(r->ref_id); + + if (r->fp) + fclose(r->fp); + + pthread_mutex_destroy(&r->lock); + + free(r); +} + +static refs_t *refs_create(void) { + refs_t *r = calloc(1, sizeof(*r)); + + RP("refs_create()\n"); + + if (!r) + return NULL; + + if (!(r->pool = string_pool_create(8192))) + goto err; + + r->ref_id = NULL; // see refs2id() to populate. + r->count = 1; + r->last = NULL; + r->last_id = -1; + + if (!(r->h_meta = kh_init(refs))) + goto err; + + pthread_mutex_init(&r->lock, NULL); + + return r; + + err: + refs_free(r); + return NULL; +} + +/* + * Loads a FAI file for a reference.fasta. + * "is_err" indicates whether failure to load is worthy of emitting an + * error message. In some cases (eg with embedded references) we + * speculatively load, just incase, and silently ignore errors. 
+ * + * Returns the refs_t struct on success (maybe newly allocated); + * NULL on failure + */ +static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) { + struct stat sb; + FILE *fp = NULL; + char fai_fn[PATH_MAX]; + char line[8192]; + refs_t *r = r_orig; + size_t fn_l = strlen(fn); + + RP("refs_load_fai %s\n", fn); + + if (!r) + if (!(r = refs_create())) + goto err; + + /* Open reference, for later use */ + if (stat(fn, &sb) != 0) { + if (is_err) + perror(fn); + goto err; + } + + if (r->fp) + fclose(r->fp); + r->fp = NULL; + + if (!(r->fn = string_dup(r->pool, fn))) + goto err; + + if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) + r->fn[fn_l-4] = 0; + + if (!(r->fp = fopen(r->fn, "r"))) { + if (is_err) + perror(fn); + goto err; + } + + /* Parse .fai file and load meta-data */ + sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn); + + if (stat(fai_fn, &sb) != 0) { + if (is_err) + perror(fai_fn); + goto err; + } + if (!(fp = fopen(fai_fn, "r"))) { + if (is_err) + perror(fai_fn); + goto err; + } + while (fgets(line, 8192, fp) != NULL) { + ref_entry *e = malloc(sizeof(*e)); + char *cp; + int n; + khint_t k; + + if (!e) + return NULL; + + // id + for (cp = line; *cp && !isspace(*cp); cp++) + ; + *cp++ = 0; + e->name = string_dup(r->pool, line); + + // length + while (*cp && isspace(*cp)) + cp++; + e->length = strtoll(cp, &cp, 10); + + // offset + while (*cp && isspace(*cp)) + cp++; + e->offset = strtoll(cp, &cp, 10); + + // bases per line + while (*cp && isspace(*cp)) + cp++; + e->bases_per_line = strtol(cp, &cp, 10); + + // line length + while (*cp && isspace(*cp)) + cp++; + e->line_length = strtol(cp, &cp, 10); + + // filename + e->fn = r->fn; + + e->count = 0; + e->seq = NULL; + + k = kh_put(refs, r->h_meta, e->name, &n); + if (-1 == n) { + free(e); + return NULL; + } + + if (n) { + kh_val(r->h_meta, k) = e; + } else { + ref_entry *re = kh_val(r->h_meta, k); + if (re && (re->count != 0 || re->length != 0)) { + /* Keep old */ + free(e); + } else { + /* 
Replace old */ + if (re) + free(re); + kh_val(r->h_meta, k) = e; + } + } + } + + return r; + + err: + if (fp) + fclose(fp); + + if (!r_orig) + refs_free(r); + + return NULL; +} + +/* + * Indexes references by the order they appear in a BAM file. This may not + * necessarily be the same order they appear in the fasta reference file. + * + * Returns 0 on success + * -1 on failure + */ +int refs2id(refs_t *r, SAM_hdr *h) { + int i; + + if (r->ref_id) + free(r->ref_id); + if (r->last) + r->last = NULL; + + r->ref_id = calloc(h->nref, sizeof(*r->ref_id)); + if (!r->ref_id) + return -1; + + r->nref = h->nref; + for (i = 0; i < h->nref; i++) { + khint_t k = kh_get(refs, r->h_meta, h->ref[i].name); + if (k != kh_end(r->h_meta)) { + r->ref_id[i] = kh_val(r->h_meta, k); + } else { + fprintf(stderr, "Unable to find ref name '%s'\n", + h->ref[i].name); + } + } + + return 0; +} + +/* + * Generates refs_t entries based on @SQ lines in the header. + * Returns 0 on success + * -1 on failure + */ +static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) { + int i; + + if (!h || h->nref == 0) + return 0; + + //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode); + + /* Existing refs are fine, as long as they're compatible with the hdr. 
*/ + i = r->nref; + if (r->nref < h->nref) + r->nref = h->nref; + + if (!(r->ref_id = realloc(r->ref_id, r->nref * sizeof(*r->ref_id)))) + return -1; + + for (; i < r->nref; i++) + r->ref_id[i] = NULL; + + /* Copy info from h->ref[i] over to r */ + for (i = 0; i < h->nref; i++) { + SAM_hdr_type *ty; + SAM_hdr_tag *tag; + khint_t k; + int n; + + if (r->ref_id[i] && 0 == strcmp(r->ref_id[i]->name, h->ref[i].name)) + continue; + + if (!(r->ref_id[i] = calloc(1, sizeof(ref_entry)))) + return -1; + + if (!h->ref[i].name) + return -1; + + r->ref_id[i]->name = string_dup(r->pool, h->ref[i].name); + r->ref_id[i]->length = 0; // marker for not yet loaded + + /* Initialise likely filename if known */ + if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) { + if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) { + r->ref_id[i]->fn = string_dup(r->pool, tag->str+3); + //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[i]->name, r->ref_id[i]->fn); + } + } + + k = kh_put(refs, r->h_meta, r->ref_id[i]->name, &n); + if (n <= 0) // already exists or error + return -1; + kh_val(r->h_meta, k) = r->ref_id[i]; + } + + return 0; +} + +/* + * Attaches a header to a cram_fd. + * + * This should be used when creating a new cram_fd for writing where + * we have an SAM_hdr already constructed (eg from a file we've read + * in). + */ +int cram_set_header(cram_fd *fd, SAM_hdr *hdr) { + fd->header = hdr; + return refs_from_header(fd->refs, fd, hdr); +} + +/* + * Converts a directory and a filename into an expanded path, replacing %s + * in directory with the filename and %[0-9]+s with portions of the filename + * Any remaining parts of filename are added to the end with /%s. 
+ */ +void expand_cache_path(char *path, char *dir, char *fn) { + char *cp; + + while ((cp = strchr(dir, '%'))) { + strncpy(path, dir, cp-dir); + path += cp-dir; + + if (*++cp == 's') { + strcpy(path, fn); + path += strlen(fn); + fn += strlen(fn); + cp++; + } else if (*cp >= '0' && *cp <= '9') { + char *endp; + long l; + + l = strtol(cp, &endp, 10); + l = MIN(l, strlen(fn)); + if (*endp == 's') { + strncpy(path, fn, l); + path += l; + fn += l; + *path = 0; + cp = endp+1; + } else { + *path++ = '%'; + *path++ = *cp++; + } + } else { + *path++ = '%'; + *path++ = *cp++; + } + dir = cp; + } + strcpy(path, dir); + path += strlen(dir); + if (*fn && path[-1] != '/') + *path++ = '/'; + strcpy(path, fn); +} + +/* + * Make the directory containing path and any prefix directories. + */ +void mkdir_prefix(char *path, int mode) { + char *cp = strrchr(path, '/'); + if (!cp) + return; + + *cp = 0; + if (is_directory(path)) { + *cp = '/'; + return; + } + + if (mkdir(path, mode) == 0) { + chmod(path, mode); + *cp = '/'; + return; + } + + mkdir_prefix(path, mode); + mkdir(path, mode); + chmod(path, mode); + *cp = '/'; +} + +/* + * Queries the M5 string from the header and attempts to populate the + * reference from this using the REF_PATH environment. 
+ * + * Returns 0 on sucess + * -1 on failure + */ +static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { + char *ref_path = getenv("REF_PATH"); + SAM_hdr_type *ty; + SAM_hdr_tag *tag; + char path[PATH_MAX], path_tmp[PATH_MAX]; + char *local_cache = getenv("REF_CACHE"); + mFILE *mf; + + if (fd->verbose) + fprintf(stderr, "cram_populate_ref on fd %p, id %d\n", fd, id); + + if (!ref_path) + ref_path = "."; + + if (!r->name) + return -1; + + if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name))) + return -1; + + if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL))) + goto no_M5; + + if (fd->verbose) + fprintf(stderr, "Querying ref %s\n", tag->str+3); + + /* Use cache if available */ + if (local_cache && *local_cache) { + struct stat sb; + FILE *fp; + + expand_cache_path(path, local_cache, tag->str+3); + + if (0 == stat(path, &sb) && (fp = fopen(path, "r"))) { + r->length = sb.st_size; + r->offset = r->line_length = r->bases_per_line = 0; + + r->fn = string_dup(fd->refs->pool, path); + + if (fd->refs->fp) + fclose(fd->refs->fp); + fd->refs->fp = fp; + fd->refs->fn = r->fn; + + // Fall back to cram_get_ref() where it'll do the actual + // reading of the file. + return 0; + } + } + + /* Otherwise search */ + if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) { + size_t sz; + r->seq = mfsteal(mf, &sz); + r->length = sz; + } else { + refs_t *refs; + char *fn; + + no_M5: + /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */ + if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL))) + return -1; + + fn = (strncmp(tag->str+3, "file:", 5) == 0) + ? 
tag->str+8 + : tag->str+3; + + if (fd->refs->fp) { + fclose(fd->refs->fp); + fd->refs->fp = NULL; + } + if (!(refs = refs_load_fai(fd->refs, fn, 0))) + return -1; + fd->refs = refs; + if (fd->refs->fp) { + fclose(fd->refs->fp); + fd->refs->fp = NULL; + } + + if (!fd->refs->fn) + return -1; + + if (-1 == refs2id(fd->refs, fd->header)) + return -1; + if (!fd->refs->ref_id || !fd->refs->ref_id[id]) + return -1; + + // Local copy already, so fall back to cram_get_ref(). + return 0; + } + + /* Populate the local disk cache if required */ + if (local_cache && *local_cache) { + FILE *fp; + int i; + + expand_cache_path(path, local_cache, tag->str+3); + if (fd->verbose) + fprintf(stderr, "Path='%s'\n", path); + mkdir_prefix(path, 01777); + + i = 0; + do { + sprintf(path_tmp, "%s.tmp_%d", path, /*getpid(),*/ i); + i++; + fp = fopen(path_tmp, "wx"); + } while (fp == NULL && errno == EEXIST); + if (!fp) { + perror(path_tmp); + + // Not fatal - we have the data already so keep going. + return 0; + } + + if (r->length != fwrite(r->seq, 1, r->length, fp)) { + perror(path); + } + if (-1 == paranoid_fclose(fp)) { + unlink(path_tmp); + } else { + if (0 == chmod(path_tmp, 0444)) + rename(path_tmp, path); + else + unlink(path_tmp); + } + } + + return 0; +} + +static void cram_ref_incr_locked(refs_t *r, int id) { + RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1); + + if (id < 0 || !r->ref_id[id]->seq) + return; + + if (r->last_id == id) + r->last_id = -1; + + ++r->ref_id[id]->count; +} + +void cram_ref_incr(refs_t *r, int id) { + pthread_mutex_lock(&r->lock); + cram_ref_incr_locked(r, id); + pthread_mutex_unlock(&r->lock); +} + +static void cram_ref_decr_locked(refs_t *r, int id) { + RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1); + + if (id < 0 || !r->ref_id[id]->seq) { + assert(r->ref_id[id]->count >= 0); + return; + } + + if 
(--r->ref_id[id]->count <= 0) { + assert(r->ref_id[id]->count == 0); + if (r->last_id >= 0) { + if (r->ref_id[r->last_id]->count <= 0 && + r->ref_id[r->last_id]->seq) { + RP("%d FREE REF %d (%p)\n", gettid(), + r->last_id, r->ref_id[r->last_id]->seq); + free(r->ref_id[r->last_id]->seq); + r->ref_id[r->last_id]->seq = NULL; + r->ref_id[r->last_id]->length = 0; + } + r->last_id = -1; + } else { + r->last_id = id; + } + } +} + +void cram_ref_decr(refs_t *r, int id) { + pthread_mutex_lock(&r->lock); + cram_ref_decr_locked(r, id); + pthread_mutex_unlock(&r->lock); +} + +/* + * Used by cram_ref_load and cram_ref_get. The file handle will have + * already been opened, so we can catch it. The ref_entry *e informs us + * of whether this is a multi-line fasta file or a raw MD5 style file. + * Either way we create a single contiguous sequence. + * + * Returns all or part of a reference sequence on success (malloced); + * NULL on failure. + */ +static char *load_ref_portion(FILE *fp, ref_entry *e, int start, int end) { + off_t offset, len; + char *seq; + + if (end < start) + end = start; + + /* + * Compute locations in file. This is trivial for the MD5 files, but + * is still necessary for the fasta variants. + */ + offset = e->line_length + ? e->offset + (start-1)/e->bases_per_line * e->line_length + + (start-1) % e->bases_per_line + : start-1; + + len = (e->line_length + ? e->offset + (end-1)/e->bases_per_line * e->line_length + + (end-1) % e->bases_per_line + : end-1) - offset + 1; + + if (0 != fseeko(fp, offset, SEEK_SET)) { + perror("fseeko() on reference file"); + return NULL; + } + + if (len == 0 || !(seq = malloc(len))) { + return NULL; + } + + if (len != fread(seq, 1, len, fp)) { + perror("fread() on reference file"); + free(seq); + return NULL; + } + + /* Strip white-space if required. */ + if (len != end-start+1) { + int i, j; + char *cp = seq; + char *cp_to; + + for (i = j = 0; i < len; i++) { + if (cp[i] >= '!' 
&& cp[i] <= '~') + cp[j++] = cp[i] & ~0x20; + } + cp_to = cp+j; + + if (cp_to - seq != end-start+1) { + fprintf(stderr, "Malformed reference file?\n"); + free(seq); + return NULL; + } + } else { + int i; + for (i = 0; i < len; i++) { + seq[i] = seq[i] & ~0x20; // uppercase in ASCII + } + } + + return seq; +} + +/* + * Load the entire reference 'id'. + * This also increments the reference count by 1. + * + * Returns ref_entry on success; + * NULL on failure + */ +ref_entry *cram_ref_load(refs_t *r, int id) { + ref_entry *e = r->ref_id[id]; + int start = 1, end = e->length; + char *seq; + + if (e->seq) { + return e; + } + + assert(e->count == 0); + + if (r->last) { +#ifdef REF_DEBUG + int idx = 0; + for (idx = 0; idx < r->nref; idx++) + if (r->last == r->ref_id[idx]) + break; + RP("%d cram_ref_load DECR %d\n", gettid(), idx); +#endif + assert(r->last->count > 0); + if (--r->last->count <= 0) { + RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq); + if (r->last->seq) { + free(r->last->seq); + r->last->seq = NULL; + } + } + } + + /* Open file if it's not already the current open reference */ + if (strcmp(r->fn, e->fn) || r->fp == NULL) { + if (r->fp) + fclose(r->fp); + r->fn = e->fn; + if (!(r->fp = fopen(r->fn, "r"))) { + perror(r->fn); + return NULL; + } + } + + RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end); + + if (!(seq = load_ref_portion(r->fp, e, start, end))) { + return NULL; + } + + RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq); + + RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1)); + e->seq = seq; + e->count++; + + /* + * Also keep track of last used ref so incr/decr loops on the same + * sequence don't cause load/free loops. + */ + RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1); + r->last = e; + e->count++; + + return e; +} + +/* + * Returns a portion of a reference sequence from start to end inclusive. 
+ * The returned pointer is owned by either the cram_file fd or by the + * internal refs_t structure and should not be freed by the caller. + * + * The difference is whether or not this refs_t is in use by just the one + * cram_fd or by multiples, or whether we have multiple threads accessing + * references. In either case fd->shared will be true and we start using + * reference counting to track the number of users of a specific reference + * sequence. + * + * Otherwise the ref seq returned is allocated as part of cram_fd itself + * and will be freed up on the next call to cram_get_ref or cram_close. + * + * To return the entire reference sequence, specify start as 1 and end + * as 0. + * + * To cease using a reference, call cram_ref_decr(). + * + * Returns reference on success, + * NULL on failure + */ +char *cram_get_ref(cram_fd *fd, int id, int start, int end) { + ref_entry *r; + char *seq; + int ostart = start; + + if (id == -1) + return NULL; + + /* FIXME: axiomatic query of r->seq being true? + * Or shortcut for unsorted data where we load once and never free? + */ + + //fd->shared_ref = 1; // hard code for now to simplify things + + pthread_mutex_lock(&fd->ref_lock); + + RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end); + + /* + * Unsorted data implies we want to fetch an entire reference at a time. + * We just deal with this at the moment by claiming we're sharing + * references instead, which has the same requirement. + */ + if (fd->unsorted) + fd->shared_ref = 1; + + + /* Sanity checking: does this ID exist? 
*/ + if (id >= fd->refs->nref) { + fprintf(stderr, "No reference found for id %d\n", id); + pthread_mutex_unlock(&fd->ref_lock); + return NULL; + } + + if (!fd->refs || !fd->refs->ref_id[id]) { + fprintf(stderr, "No reference found for id %d\n", id); + pthread_mutex_unlock(&fd->ref_lock); + return NULL; + } + + if (!(r = fd->refs->ref_id[id])) { + fprintf(stderr, "No reference found for id %d\n", id); + pthread_mutex_unlock(&fd->ref_lock); + return NULL; + } + + + /* + * It has an entry, but may not have been populated yet. + * Any manually loaded .fai files have their lengths known. + * A ref entry computed from @SQ lines (M5 or UR field) will have + * r->length == 0 unless it's been loaded once and verified that we have + * an on-disk filename for it. + * + * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls + * open_path_mfile and libcurl, which isn't multi-thread safe unless I + * rewrite my code to have one curl handle per thread. + */ + pthread_mutex_lock(&fd->refs->lock); + if (r->length == 0) { + if (cram_populate_ref(fd, id, r) == -1) { + fprintf(stderr, "Failed to populate reference for id %d\n", id); + pthread_mutex_unlock(&fd->refs->lock); + pthread_mutex_unlock(&fd->ref_lock); + return NULL; + } + r = fd->refs->ref_id[id]; + } + + + /* + * We now know that we the filename containing the reference, so check + * for limits. If it's over half the reference we'll load all of it in + * memory as this will speed up subsequent calls. + */ + if (end < 1) + end = r->length; + if (end >= r->length) + end = r->length; + assert(start >= 1); + + if (end - start >= 0.5*r->length || fd->shared_ref) { + start = 1; + end = r->length; + } + + /* + * Maybe we have it cached already? If so use it. + * + * Alternatively if we don't have the sequence but we're sharing + * references and/or are asking for the entire length of it, then + * load the full reference into the refs structure and return + * a pointer to that one instead. 
+ */ + if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) { + char *cp; + + if (id >= 0) { + if (r->seq) { + cram_ref_incr_locked(fd->refs, id); + } else { + ref_entry *e; + if (!(e = cram_ref_load(fd->refs, id))) { + pthread_mutex_unlock(&fd->refs->lock); + pthread_mutex_unlock(&fd->ref_lock); + return NULL; + } + + /* unsorted data implies cache ref indefinitely, to avoid + * continually loading and unloading. + */ + if (fd->unsorted) + cram_ref_incr_locked(fd->refs, id); + } + + fd->ref = NULL; /* We never access it directly */ + fd->ref_start = 1; + fd->ref_end = r->length; + fd->ref_id = id; + + cp = fd->refs->ref_id[id]->seq + ostart-1; + } else { + fd->ref = NULL; + cp = NULL; + } + + RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count); + + pthread_mutex_unlock(&fd->refs->lock); + pthread_mutex_unlock(&fd->ref_lock); + return cp; + } + + /* + * Otherwise we're not sharing, we don't have a copy of it already and + * we're only asking for a small portion of it. + * + * In this case load up just that segment ourselves, freeing any old + * small segments in the process. 
+ */ + + /* Unmapped ref ID */ + if (id < 0) { + if (fd->ref_free) { + free(fd->ref_free); + fd->ref_free = NULL; + } + fd->ref = NULL; + fd->ref_id = id; + pthread_mutex_unlock(&fd->refs->lock); + pthread_mutex_unlock(&fd->ref_lock); + return NULL; + } + + /* Open file if it's not already the current open reference */ + if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) { + if (fd->refs->fp) + fclose(fd->refs->fp); + fd->refs->fn = r->fn; + if (!(fd->refs->fp = fopen(fd->refs->fn, "r"))) { + perror(fd->refs->fn); + pthread_mutex_unlock(&fd->refs->lock); + pthread_mutex_unlock(&fd->ref_lock); + return NULL; + } + } + + if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) { + pthread_mutex_unlock(&fd->refs->lock); + pthread_mutex_unlock(&fd->ref_lock); + return NULL; + } + + if (fd->ref_free) + free(fd->ref_free); + + fd->ref_id = id; + fd->ref_start = start; + fd->ref_end = end; + fd->ref_free = fd->ref; + seq = fd->ref; + + pthread_mutex_unlock(&fd->refs->lock); + pthread_mutex_unlock(&fd->ref_lock); + + return seq + ostart - start; +} + +/* + * If fd has been opened for reading, it may be permitted to specify 'fn' + * as NULL and let the code auto-detect the reference by parsing the + * SAM header @SQ lines. + */ +int cram_load_reference(cram_fd *fd, char *fn) { + if (fn) { + fd->refs = refs_load_fai(fd->refs, fn, + !(fd->embed_ref && fd->mode == 'r')); + fn = fd->refs ? fd->refs->fn : NULL; + } + fd->ref_fn = fn; + + if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) { + if (!(fd->refs = refs_create())) + return -1; + if (-1 == refs_from_header(fd->refs, fd, fd->header)) + return -1; + } + + if (-1 == refs2id(fd->refs, fd->header)) + return -1; + + return fn ? 0 : -1; +} + +/* ---------------------------------------------------------------------- + * Containers + */ + +/* + * Creates a new container, specifying the maximum number of slices + * and records permitted. 
+ * + * Returns cram_container ptr on success + * NULL on failure + */ +cram_container *cram_new_container(int nrec, int nslice) { + cram_container *c = calloc(1, sizeof(*c)); + if (!c) + return NULL; + + c->curr_ref = -2; + + c->max_c_rec = nrec * nslice; + c->curr_c_rec = 0; + + c->max_rec = nrec; + c->record_counter = 0; + c->num_bases = 0; + + c->max_slice = nslice; + c->curr_slice = 0; + + c->pos_sorted = 1; + c->max_apos = 0; + c->multi_seq = 0; + + c->bams = NULL; + + if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *)))) + goto err; + c->slice = NULL; + + if (!(c->comp_hdr = cram_new_compression_header())) + goto err; + c->comp_hdr_block = NULL; + + if (!(c->BF_stats = cram_stats_create())) goto err; + if (!(c->CF_stats = cram_stats_create())) goto err; + if (!(c->RN_stats = cram_stats_create())) goto err; + if (!(c->AP_stats = cram_stats_create())) goto err; + if (!(c->RG_stats = cram_stats_create())) goto err; + if (!(c->MQ_stats = cram_stats_create())) goto err; + if (!(c->NS_stats = cram_stats_create())) goto err; + if (!(c->NP_stats = cram_stats_create())) goto err; + if (!(c->TS_stats = cram_stats_create())) goto err; + if (!(c->MF_stats = cram_stats_create())) goto err; + if (!(c->NF_stats = cram_stats_create())) goto err; + if (!(c->RL_stats = cram_stats_create())) goto err; + if (!(c->FN_stats = cram_stats_create())) goto err; + if (!(c->FC_stats = cram_stats_create())) goto err; + if (!(c->FP_stats = cram_stats_create())) goto err; + if (!(c->DL_stats = cram_stats_create())) goto err; + if (!(c->BA_stats = cram_stats_create())) goto err; + if (!(c->QS_stats = cram_stats_create())) goto err; + if (!(c->BS_stats = cram_stats_create())) goto err; + if (!(c->TC_stats = cram_stats_create())) goto err; + if (!(c->TN_stats = cram_stats_create())) goto err; + if (!(c->TL_stats = cram_stats_create())) goto err; + if (!(c->RI_stats = cram_stats_create())) goto err; + if (!(c->RS_stats = cram_stats_create())) goto err; + if (!(c->PD_stats = 
cram_stats_create())) goto err; + if (!(c->HC_stats = cram_stats_create())) goto err; + + //c->aux_B_stats = cram_stats_create(); + + if (!(c->tags_used = kh_init(s_i2i))) + goto err; + c->refs_used = 0; + + return c; + + err: + if (c) { + if (c->slices) + free(c->slices); + free(c); + } + return NULL; +} + +void cram_free_container(cram_container *c) { + int i; + + if (!c) + return; + + if (c->refs_used) + free(c->refs_used); + + if (c->landmark) + free(c->landmark); + + if (c->comp_hdr) + cram_free_compression_header(c->comp_hdr); + + if (c->comp_hdr_block) + cram_free_block(c->comp_hdr_block); + + if (c->slices) { + for (i = 0; i < c->max_slice; i++) + if (c->slices[i]) + cram_free_slice(c->slices[i]); + free(c->slices); + } + + if (c->TS_stats) cram_stats_free(c->TS_stats); + if (c->RG_stats) cram_stats_free(c->RG_stats); + if (c->FP_stats) cram_stats_free(c->FP_stats); + if (c->NS_stats) cram_stats_free(c->NS_stats); + if (c->RN_stats) cram_stats_free(c->RN_stats); + if (c->CF_stats) cram_stats_free(c->CF_stats); + if (c->TN_stats) cram_stats_free(c->TN_stats); + if (c->BA_stats) cram_stats_free(c->BA_stats); + if (c->TV_stats) cram_stats_free(c->TV_stats); + if (c->BS_stats) cram_stats_free(c->BS_stats); + if (c->FC_stats) cram_stats_free(c->FC_stats); + if (c->BF_stats) cram_stats_free(c->BF_stats); + if (c->AP_stats) cram_stats_free(c->AP_stats); + if (c->NF_stats) cram_stats_free(c->NF_stats); + if (c->MF_stats) cram_stats_free(c->MF_stats); + if (c->FN_stats) cram_stats_free(c->FN_stats); + if (c->RL_stats) cram_stats_free(c->RL_stats); + if (c->DL_stats) cram_stats_free(c->DL_stats); + if (c->TC_stats) cram_stats_free(c->TC_stats); + if (c->TL_stats) cram_stats_free(c->TL_stats); + if (c->MQ_stats) cram_stats_free(c->MQ_stats); + if (c->TM_stats) cram_stats_free(c->TM_stats); + if (c->QS_stats) cram_stats_free(c->QS_stats); + if (c->NP_stats) cram_stats_free(c->NP_stats); + if (c->RI_stats) cram_stats_free(c->RI_stats); + if (c->RS_stats) 
cram_stats_free(c->RS_stats); + if (c->PD_stats) cram_stats_free(c->PD_stats); + if (c->HC_stats) cram_stats_free(c->HC_stats); + + //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats); + + if (c->tags_used) kh_destroy(s_i2i, c->tags_used); + + free(c); +} + +/* + * Reads a container header. + * + * Returns cram_container on success + * NULL on failure or no container left (fd->err == 0). + */ +cram_container *cram_read_container(cram_fd *fd) { + cram_container c2, *c; + int i, s; + size_t rd = 0; + + fd->err = 0; + + memset(&c2, 0, sizeof(c2)); + if (fd->version == CRAM_1_VERS) { + if ((s = itf8_decode(fd, &c2.length)) == -1) { + fd->eof = fd->empty_container ? 1 : 2; + return NULL; + } else { + rd+=s; + } + } else { + if ((s = int32_decode(fd, &c2.length)) == -1) { + fd->eof = fd->empty_container ? 1 : 2; + return NULL; + } else { + rd+=s; + } + } + if ((s = itf8_decode(fd, &c2.ref_seq_id)) == -1) return NULL; else rd+=s; + if ((s = itf8_decode(fd, &c2.ref_seq_start))== -1) return NULL; else rd+=s; + if ((s = itf8_decode(fd, &c2.ref_seq_span)) == -1) return NULL; else rd+=s; + if ((s = itf8_decode(fd, &c2.num_records)) == -1) return NULL; else rd+=s; + + if (fd->version == CRAM_1_VERS) { + c2.record_counter = 0; + c2.num_bases = 0; + } else { + if ((s = itf8_decode(fd, &c2.record_counter)) == -1) + return NULL; + else + rd += s; + + if ((s = ltf8_decode(fd, &c2.num_bases))== -1) + return NULL; + else + rd += s; + } + if ((s = itf8_decode(fd, &c2.num_blocks)) == -1) return NULL; else rd+=s; + if ((s = itf8_decode(fd, &c2.num_landmarks))== -1) return NULL; else rd+=s; + + if (!(c = calloc(1, sizeof(*c)))) + return NULL; + + *c = c2; + + if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) && + c->num_landmarks) { + fd->err = errno; + cram_free_container(c); + return NULL; + } + for (i = 0; i < c->num_landmarks; i++) { + if ((s = itf8_decode(fd, &c->landmark[i])) == -1) { + cram_free_container(c); + return NULL; + } else { + rd += s; + } + } + c->offset 
= rd; + + c->slices = NULL; + c->curr_slice = 0; + c->max_slice = c->num_landmarks; + c->slice_rec = 0; + c->curr_rec = 0; + c->max_rec = 0; + + if (c->ref_seq_id == -2) { + c->multi_seq = 1; + fd->multi_seq = 1; + } + + fd->empty_container = + (c->num_records == 0 && + c->ref_seq_id == -1 && + c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0; + + return c; +} + +/* + * Writes a container structure. + * + * Returns 0 on success + * -1 on failure + */ +int cram_write_container(cram_fd *fd, cram_container *c) { + char buf_a[1024], *buf = buf_a, *cp; + int i; + + if (50 + c->num_landmarks * 5 >= 1024) + buf = malloc(50 + c->num_landmarks * 5); + cp = buf; + + if (fd->version == CRAM_1_VERS) { + cp += itf8_put(cp, c->length); + } else { + *(int32_t *)cp = le_int4(c->length); + cp += 4; + } + if (c->multi_seq) { + cp += itf8_put(cp, -2); + cp += itf8_put(cp, 0); + cp += itf8_put(cp, 0); + } else { + cp += itf8_put(cp, c->ref_seq_id); + cp += itf8_put(cp, c->ref_seq_start); + cp += itf8_put(cp, c->ref_seq_span); + } + cp += itf8_put(cp, c->num_records); + if (fd->version != CRAM_1_VERS) { + cp += itf8_put(cp, c->record_counter); + cp += ltf8_put(cp, c->num_bases); + } + cp += itf8_put(cp, c->num_blocks); + cp += itf8_put(cp, c->num_landmarks); + for (i = 0; i < c->num_landmarks; i++) + cp += itf8_put(cp, c->landmark[i]); + if (cp-buf != hwrite(fd->fp, buf, cp-buf)) { + if (buf != buf_a) + free(buf); + return -1; + } + + if (buf != buf_a) + free(buf); + + return 0; +} + +// common component shared by cram_flush_container{,_mt} +static int cram_flush_container2(cram_fd *fd, cram_container *c) { + int i, j; + + //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum); + + /* Write the container struct itself */ + if (0 != cram_write_container(fd, c)) + return -1; + + /* And the compression header */ + if (0 != cram_write_block(fd, c->comp_hdr_block)) + return -1; + + /* Followed by the slice blocks */ + for (i = 0; i < c->curr_slice; i++) { + cram_slice *s 
= c->slices[i]; + + if (0 != cram_write_block(fd, s->hdr_block)) + return -1; + + for (j = 0; j < s->hdr->num_blocks; j++) { + if (0 != cram_write_block(fd, s->block[j])) + return -1; + } + } + + return hflush(fd->fp) == 0 ? 0 : -1; +} + +/* + * Flushes a completely or partially full container to disk, writing + * container structure, header and blocks. This also calls the encoder + * functions. + * + * Returns 0 on success + * -1 on failure + */ +int cram_flush_container(cram_fd *fd, cram_container *c) { + /* Encode the container blocks and generate compression header */ + if (0 != cram_encode_container(fd, c)) + return -1; + + return cram_flush_container2(fd, c); +} + +typedef struct { + cram_fd *fd; + cram_container *c; +} cram_job; + +void *cram_flush_thread(void *arg) { + cram_job *j = (cram_job *)arg; + + /* Encode the container blocks and generate compression header */ + if (0 != cram_encode_container(j->fd, j->c)) { + fprintf(stderr, "cram_encode_container failed\n"); + return NULL; + } + + return arg; +} + +static int cram_flush_result(cram_fd *fd) { + int i, ret = 0; + t_pool_result *r; + + while ((r = t_pool_next_result(fd->rqueue))) { + cram_job *j = (cram_job *)r->data; + cram_container *c; + + if (!j) { + t_pool_delete_result(r, 0); + return -1; + } + + fd = j->fd; + c = j->c; + + if (0 != cram_flush_container2(fd, c)) + return -1; + + /* Free the container */ + for (i = 0; i < c->max_slice; i++) { + cram_free_slice(c->slices[i]); + c->slices[i] = NULL; + } + + c->slice = NULL; + c->curr_slice = 0; + + cram_free_container(c); + + ret |= hflush(fd->fp) == 0 ? 
0 : -1; + + t_pool_delete_result(r, 1); + } + + return ret; +} + +int cram_flush_container_mt(cram_fd *fd, cram_container *c) { + cram_job *j; + + if (!fd->pool) + return cram_flush_container(fd, c); + + if (!(j = malloc(sizeof(*j)))) + return -1; + j->fd = fd; + j->c = c; + + t_pool_dispatch(fd->pool, fd->rqueue, cram_flush_thread, j); + + return cram_flush_result(fd); +} + +/* ---------------------------------------------------------------------- + * Compression headers; the first part of the container + */ + +/* + * Creates a new blank container compression header + * + * Returns header ptr on success + * NULL on failure + */ +cram_block_compression_hdr *cram_new_compression_header(void) { + cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr)); + if (!hdr) + return NULL; + + if (!(hdr->TD_blk = cram_new_block(CORE, 0))) { + free(hdr); + return NULL; + } + + if (!(hdr->TD_hash = kh_init(m_s2i))) { + cram_free_block(hdr->TD_blk); + free(hdr); + return NULL; + } + + if (!(hdr->TD_keys = string_pool_create(8192))) { + kh_destroy(m_s2i, hdr->TD_hash); + cram_free_block(hdr->TD_blk); + free(hdr); + return NULL; + } + + return hdr; +} + +void cram_free_compression_header(cram_block_compression_hdr *hdr) { + int i; + + if (hdr->landmark) + free(hdr->landmark); + + if (hdr->preservation_map) + kh_destroy(map, hdr->preservation_map); + + for (i = 0; i < CRAM_MAP_HASH; i++) { + cram_map *m, *m2; + for (m = hdr->rec_encoding_map[i]; m; m = m2) { + m2 = m->next; + if (m->codec) + m->codec->free(m->codec); + free(m); + } + } + + for (i = 0; i < CRAM_MAP_HASH; i++) { + cram_map *m, *m2; + for (m = hdr->tag_encoding_map[i]; m; m = m2) { + m2 = m->next; + if (m->codec) + m->codec->free(m->codec); + free(m); + } + } + + if (hdr->BF_codec) hdr->BF_codec->free(hdr->BF_codec); + if (hdr->CF_codec) hdr->CF_codec->free(hdr->CF_codec); + if (hdr->RL_codec) hdr->RL_codec->free(hdr->RL_codec); + if (hdr->AP_codec) hdr->AP_codec->free(hdr->AP_codec); + if (hdr->RG_codec) 
hdr->RG_codec->free(hdr->RG_codec); + if (hdr->MF_codec) hdr->MF_codec->free(hdr->MF_codec); + if (hdr->NS_codec) hdr->NS_codec->free(hdr->NS_codec); + if (hdr->NP_codec) hdr->NP_codec->free(hdr->NP_codec); + if (hdr->TS_codec) hdr->TS_codec->free(hdr->TS_codec); + if (hdr->NF_codec) hdr->NF_codec->free(hdr->NF_codec); + if (hdr->TC_codec) hdr->TC_codec->free(hdr->TC_codec); + if (hdr->TN_codec) hdr->TN_codec->free(hdr->TN_codec); + if (hdr->TL_codec) hdr->TL_codec->free(hdr->TL_codec); + if (hdr->FN_codec) hdr->FN_codec->free(hdr->FN_codec); + if (hdr->FC_codec) hdr->FC_codec->free(hdr->FC_codec); + if (hdr->FP_codec) hdr->FP_codec->free(hdr->FP_codec); + if (hdr->BS_codec) hdr->BS_codec->free(hdr->BS_codec); + if (hdr->IN_codec) hdr->IN_codec->free(hdr->IN_codec); + if (hdr->SC_codec) hdr->SC_codec->free(hdr->SC_codec); + if (hdr->DL_codec) hdr->DL_codec->free(hdr->DL_codec); + if (hdr->BA_codec) hdr->BA_codec->free(hdr->BA_codec); + if (hdr->MQ_codec) hdr->MQ_codec->free(hdr->MQ_codec); + if (hdr->RN_codec) hdr->RN_codec->free(hdr->RN_codec); + if (hdr->QS_codec) hdr->QS_codec->free(hdr->QS_codec); + if (hdr->Qs_codec) hdr->Qs_codec->free(hdr->Qs_codec); + if (hdr->RI_codec) hdr->RI_codec->free(hdr->RI_codec); + if (hdr->RS_codec) hdr->RS_codec->free(hdr->RS_codec); + if (hdr->PD_codec) hdr->PD_codec->free(hdr->PD_codec); + if (hdr->HC_codec) hdr->HC_codec->free(hdr->HC_codec); + + if (hdr->TL) + free(hdr->TL); + if (hdr->TD_blk) + cram_free_block(hdr->TD_blk); + if (hdr->TD_hash) + kh_destroy(m_s2i, hdr->TD_hash); + if (hdr->TD_keys) + string_pool_destroy(hdr->TD_keys); + + free(hdr); +} + + +/* ---------------------------------------------------------------------- + * Slices and slice headers + */ + +void cram_free_slice_header(cram_block_slice_hdr *hdr) { + if (!hdr) + return; + + if (hdr->block_content_ids) + free(hdr->block_content_ids); + + free(hdr); + + return; +} + +void cram_free_slice(cram_slice *s) { + if (!s) + return; + + if (s->hdr_block) + 
cram_free_block(s->hdr_block); + + if (s->block) { + int i; + + if (s->hdr) { + for (i = 0; i < s->hdr->num_blocks; i++) { + cram_free_block(s->block[i]); + } + } + free(s->block); + } + + if (s->block_by_id) + free(s->block_by_id); + + if (s->hdr) + cram_free_slice_header(s->hdr); + + if (s->seqs_blk) + cram_free_block(s->seqs_blk); + + if (s->qual_blk) + cram_free_block(s->qual_blk); + + if (s->name_blk) + cram_free_block(s->name_blk); + + if (s->aux_blk) + cram_free_block(s->aux_blk); + + if (s->base_blk) + cram_free_block(s->base_blk); + + if (s->soft_blk) + cram_free_block(s->soft_blk); + +#ifdef TN_external + if (s->tn_blk) + cram_free_block(s->tn_blk); +#endif + + if (s->cigar) + free(s->cigar); + + if (s->crecs) + free(s->crecs); + + if (s->features) + free(s->features); + +#ifndef TN_external + if (s->TN) + free(s->TN); +#endif + + if (s->pair_keys) + string_pool_destroy(s->pair_keys); + + if (s->pair) + kh_destroy(m_s2i, s->pair); + + free(s); +} + +/* + * Creates a new empty slice in memory, for subsequent writing to + * disk. 
+ * + * Returns cram_slice ptr on success + * NULL on failure + */ +cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) { + cram_slice *s = calloc(1, sizeof(*s)); + if (!s) + return NULL; + + if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr)))) + goto err; + s->hdr->content_type = type; + + s->hdr_block = NULL; + s->block = NULL; + s->block_by_id = NULL; + s->last_apos = 0; + s->id = 0; + if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err; + s->cigar = NULL; + s->cigar_alloc = 0; + s->ncigar = 0; + + if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; + if (!(s->qual_blk = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) goto err; + if (!(s->name_blk = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) goto err; + if (!(s->aux_blk = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) goto err; + if (!(s->base_blk = cram_new_block(EXTERNAL, CRAM_EXT_IN))) goto err; + if (!(s->soft_blk = cram_new_block(EXTERNAL, CRAM_EXT_SC))) goto err; +#ifdef TN_external + if (!(s->tn_blk = cram_new_block(EXTERNAL, CRAM_EXT_TN))) goto err; +#endif + + s->features = NULL; + s->nfeatures = s->afeatures = 0; + +#ifndef TN_external + s->TN = NULL; + s->nTN = s->aTN = 0; +#endif + + // Volatile keys as we do realloc in dstring + if (!(s->pair_keys = string_pool_create(8192))) goto err; + if (!(s->pair = kh_init(m_s2i))) goto err; + +#ifdef BA_external + s->BA_len = 0; +#endif + + return s; + + err: + if (s) + cram_free_slice(s); + + return NULL; +} + +/* + * Loads an entire slice. + * FIXME: In 1.0 the native unit of slices within CRAM is broken + * as slices contain references to objects in other slices. + * To work around this while keeping the slice oriented outer loop + * we read all slices and stitch them together into a fake large + * slice instead. 
+ * + * Returns cram_slice ptr on success + * NULL on failure + */ +cram_slice *cram_read_slice(cram_fd *fd) { + cram_block *b = cram_read_block(fd); + cram_slice *s = calloc(1, sizeof(*s)); + int i, n, max_id, min_id; + + if (!b || !s) + goto err; + + s->hdr_block = b; + switch (b->content_type) { + case MAPPED_SLICE: + case UNMAPPED_SLICE: + if (!(s->hdr = cram_decode_slice_header(fd, b))) + goto err; + break; + + default: + fprintf(stderr, "Unexpected block of type %s\n", + cram_content_type2str(b->content_type)); + goto err; + } + + s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block)); + if (!s->block) + goto err; + + for (max_id = i = 0, min_id = INT_MAX; i < n; i++) { + if (!(s->block[i] = cram_read_block(fd))) + goto err; + + if (s->block[i]->content_type == EXTERNAL) { + if (max_id < s->block[i]->content_id) + max_id = s->block[i]->content_id; + if (min_id > s->block[i]->content_id) + min_id = s->block[i]->content_id; + } + } + if (min_id >= 0 && max_id < 1024) { + if (!(s->block_by_id = calloc(1024, sizeof(s->block[0])))) + goto err; + + for (i = 0; i < n; i++) { + if (s->block[i]->content_type != EXTERNAL) + continue; + s->block_by_id[s->block[i]->content_id] = s->block[i]; + } + } + + /* Initialise encoding/decoding tables */ + s->cigar = NULL; + s->cigar_alloc = 0; + s->ncigar = 0; + + if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; + if (!(s->qual_blk = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) goto err; + if (!(s->name_blk = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) goto err; + if (!(s->aux_blk = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) goto err; + if (!(s->base_blk = cram_new_block(EXTERNAL, CRAM_EXT_IN))) goto err; + if (!(s->soft_blk = cram_new_block(EXTERNAL, CRAM_EXT_SC))) goto err; +#ifdef TN_external + if (!(s->tn_blk = cram_new_block(EXTERNAL, CRAM_EXT_TN))) goto err; +#endif + + + s->crecs = NULL; + + s->last_apos = s->hdr->ref_seq_start; + + s->id = fd->slice_num++; + + return s; + + err: + if (b) + cram_free_block(b); + 
if (s) { + s->hdr_block = NULL; + cram_free_slice(s); + } + return NULL; +} + + +/* ---------------------------------------------------------------------- + * CRAM file definition (header) + */ + +/* + * Reads a CRAM file definition structure. + * Returns file_def ptr on success + * NULL on failure + */ +cram_file_def *cram_read_file_def(cram_fd *fd) { + cram_file_def *def = malloc(sizeof(*def)); + if (!def) + return NULL; + + if (26 != hread(fd->fp, &def->magic[0], 26)) { + free(def); + return NULL; + } + + if (memcmp(def->magic, "CRAM", 4) != 0) { + free(def); + return NULL; + } + + if (def->major_version > 2) { + fprintf(stderr, "CRAM version number mismatch\n" + "Expected 1.x or 2.x, got %d.%d\n", + def->major_version, def->minor_version); + free(def); + return NULL; + } + + fd->first_container += 26; + fd->last_slice = 0; + + return def; +} + +/* + * Writes a cram_file_def structure to cram_fd. + * Returns 0 on success + * -1 on failure + */ +int cram_write_file_def(cram_fd *fd, cram_file_def *def) { + return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1; +} + +void cram_free_file_def(cram_file_def *def) { + if (def) free(def); +} + +/* ---------------------------------------------------------------------- + * SAM header I/O + */ + + +/* + * Reads the SAM header from the first CRAM data block. + * Also performs minimal parsing to extract read-group + * and sample information. 
+ + * Returns SAM hdr ptr on success + * NULL on failure + */ +SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { + int32_t header_len; + char *header; + SAM_hdr *hdr; + + /* 1.1 onwards stores the header in the first block of a container */ + if (fd->version == CRAM_1_VERS) { + /* Length */ + if (-1 == int32_decode(fd, &header_len)) + return NULL; + + /* Alloc and read */ + if (NULL == (header = malloc(header_len+1))) + return NULL; + + *header = 0; + if (header_len != hread(fd->fp, header, header_len)) + return NULL; + + fd->first_container += 4 + header_len; + } else { + cram_container *c = cram_read_container(fd); + cram_block *b; + int i, len; + + if (!c) + return NULL; + + if (c->num_blocks < 1) { + cram_free_container(c); + return NULL; + } + + if (!(b = cram_read_block(fd))) { + cram_free_container(c); + return NULL; + } + + len = b->comp_size + 2 + + itf8_size(b->content_id) + + itf8_size(b->uncomp_size) + + itf8_size(b->comp_size); + + /* Extract header from 1st block */ + if (-1 == int32_get(b, &header_len) || + b->uncomp_size - 4 < header_len) { + cram_free_container(c); + cram_free_block(b); + return NULL; + } + if (NULL == (header = malloc(header_len))) { + cram_free_container(c); + cram_free_block(b); + return NULL; + } + memcpy(header, BLOCK_END(b), header_len); + cram_free_block(b); + + /* Consume any remaining blocks */ + for (i = 1; i < c->num_blocks; i++) { + if (!(b = cram_read_block(fd))) { + cram_free_container(c); + return NULL; + } + len += b->comp_size + 2 + + itf8_size(b->content_id) + + itf8_size(b->uncomp_size) + + itf8_size(b->comp_size); + cram_free_block(b); + } + + if (c->length && c->length > len) { + // Consume padding + char *pads = malloc(c->length - len); + if (!pads) { + cram_free_container(c); + return NULL; + } + + if (c->length - len != hread(fd->fp, pads, c->length - len)) { + cram_free_container(c); + return NULL; + } + free(pads); + } + + cram_free_container(c); + } + + /* Parse */ +#ifdef SAMTOOLS + hdr = sam_hdr_parse_(header, 
header_len); +#else + hdr = sam_hdr_parse(header, header_len); +#endif + free(header); + + return hdr; +} + +/* + * Converts 'in' to a full pathname to store in out. + * Out must be at least PATH_MAX bytes long. + */ +static void full_path(char *out, char *in) { + if (*in == '/') { + strncpy(out, in, PATH_MAX); + out[PATH_MAX-1] = 0; + } else { + int len; + + // unable to get dir or out+in is too long + if (!getcwd(out, PATH_MAX) || + (len = strlen(out))+1+strlen(in) >= PATH_MAX) { + strncpy(out, in, PATH_MAX); + out[PATH_MAX-1] = 0; + return; + } + + sprintf(out+len, "/%.*s", PATH_MAX - len, in); + + // FIXME: cope with `pwd`/../../../foo.fa ? + } +} + +/* + * Writes a CRAM SAM header. + * Returns 0 on success + * -1 on failure + */ +//#define BLANK_BLOCK +//#define PADDED_CONTAINER +#define PADDED_BLOCK +int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { + int header_len; + + /* 1.0 requires and UNKNOWN read-group */ + if (fd->version == CRAM_1_VERS) { + if (!sam_hdr_find_rg(hdr, "UNKNOWN")) + if (sam_hdr_add(hdr, "RG", + "ID", "UNKNOWN", "SM", "UNKNOWN", NULL)) + return -1; + } + + /* Fix M5 strings */ + if (fd->refs && !fd->no_ref) { + int i; + for (i = 0; i < hdr->nref; i++) { + SAM_hdr_type *ty; + char *ref; + + if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name))) + return -1; + + if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) { + char unsigned buf[16], buf2[33]; + int j, rlen; + MD5_CTX md5; + + if (!fd->refs || + !fd->refs->ref_id || + !fd->refs->ref_id[i]) { + return -1; + } + rlen = fd->refs->ref_id[i]->length; + MD5_Init(&md5); + ref = cram_get_ref(fd, i, 1, rlen); + if (NULL == ref) return -1; + rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */ + MD5_Update(&md5, ref, rlen); + MD5_Final(buf, &md5); + cram_ref_decr(fd->refs, i); + + for (j = 0; j < 16; j++) { + buf2[j*2+0] = "0123456789abcdef"[buf[j]>>4]; + buf2[j*2+1] = "0123456789abcdef"[buf[j]&15]; + } + buf2[32] = 0; + if (sam_hdr_update(hdr, ty, "M5", buf2, NULL)) + return -1; + 
} + + if (fd->ref_fn) { + char ref_fn[PATH_MAX]; + full_path(ref_fn, fd->ref_fn); + if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL)) + return -1; + } + } + } + + if (sam_hdr_rebuild(hdr)) + return -1; + + /* Length */ + header_len = sam_hdr_length(hdr); + if (fd->version == CRAM_1_VERS) { + if (-1 == int32_encode(fd, header_len)) + return -1; + + /* Text data */ + if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len)) + return -1; + } else { + /* Create a block inside a container */ + cram_block *b = cram_new_block(FILE_HEADER, 0); + cram_container *c = cram_new_container(0, 0); + int padded_length; + char *pads; + + if (!b || !c) { + if (b) cram_free_block(b); + if (c) cram_free_container(c); + return -1; + } + + int32_put(b, header_len); + BLOCK_APPEND(b, sam_hdr_str(hdr), header_len); + BLOCK_UPLEN(b); + +#ifndef BLANK_BLOCK + c->num_blocks = 1; + c->num_landmarks = 1; + if (!(c->landmark = malloc(sizeof(*c->landmark)))) { + cram_free_block(b); + cram_free_container(c); + return -1; + } + c->landmark[0] = 0; + + c->length = b->uncomp_size + 2 + + itf8_size(b->content_id) + + itf8_size(b->uncomp_size) + + itf8_size(b->comp_size); +#else + c->length = b->uncomp_size + 2 + + itf8_size(b->content_id) + + itf8_size(b->uncomp_size) + + itf8_size(b->comp_size); + + c->num_blocks = 2; + c->num_landmarks = 2; + if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) + return -1; + c->landmark[0] = 0; + c->landmark[1] = c->length; + + c->length *= 2; +#endif + +#ifdef PADDED_BLOCK + padded_length = MAX(c->length*1.5, 10000) - c->length; + c->length += padded_length; + if (NULL == (pads = calloc(1, padded_length))) { + cram_free_block(b); + cram_free_container(c); + return -1; + } + BLOCK_APPEND(b, pads, padded_length); + BLOCK_UPLEN(b); + free(pads); +#endif + +#ifdef PADDED_CONTAINER + padded_length = MAX(c->length*2, 10000) - c->length; + c->length += padded_length; +#endif + + if (-1 == cram_write_container(fd, c)) { + cram_free_block(b); + 
cram_free_container(c); + return -1; + } + + // Keep it uncompressed + if (-1 == cram_write_block(fd, b)) { + cram_free_block(b); + cram_free_container(c); + return -1; + } + +#ifdef BLANK_BLOCK + if (-1 == cram_write_block(fd, b)) { + cram_free_block(b); + cram_free_container(c); + return -1; + } +#endif + + cram_free_block(b); + cram_free_container(c); + +#ifdef PADDED_CONTAINER + // Write out padding to allow for in-line SAM header editing + if (NULL == (pads = calloc(1, padded_length))) + return -1; + if (padded_length != hwrite(fd->fp, pads, padded_length)) + return -1; + free(pads); +#endif + } + + if (-1 == refs_from_header(fd->refs, fd, fd->header)) + return -1; + if (-1 == refs2id(fd->refs, fd->header)) + return -1; + + if (0 != hflush(fd->fp)) + return -1; + + RP("=== Finishing saving header ===\n"); + + return 0; +} + +/* ---------------------------------------------------------------------- + * The top-level cram opening, closing and option handling + */ + +/* + * Initialises the lookup tables. These could be global statics, but they're + * clumsy to setup in a multi-threaded environment unless we generate + * verbatim code and include that. 
+ */ +static void cram_init_tables(cram_fd *fd) { + int i; + + memset(fd->L1, 4, 256); + fd->L1['A'] = 0; fd->L1['a'] = 0; + fd->L1['C'] = 1; fd->L1['c'] = 1; + fd->L1['G'] = 2; fd->L1['g'] = 2; + fd->L1['T'] = 3; fd->L1['t'] = 3; + + memset(fd->L2, 5, 256); + fd->L2['A'] = 0; fd->L2['a'] = 0; + fd->L2['C'] = 1; fd->L2['c'] = 1; + fd->L2['G'] = 2; fd->L2['g'] = 2; + fd->L2['T'] = 3; fd->L2['t'] = 3; + fd->L2['N'] = 4; fd->L2['n'] = 4; + + if (fd->version == CRAM_1_VERS) { + for (i = 0; i < 0x200; i++) { + int f = 0; + + if (i & CRAM_FPAIRED) f |= BAM_FPAIRED; + if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR; + if (i & CRAM_FUNMAP) f |= BAM_FUNMAP; + if (i & CRAM_FREVERSE) f |= BAM_FREVERSE; + if (i & CRAM_FREAD1) f |= BAM_FREAD1; + if (i & CRAM_FREAD2) f |= BAM_FREAD2; + if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY; + if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL; + if (i & CRAM_FDUP) f |= BAM_FDUP; + + fd->bam_flag_swap[i] = f; + } + + for (i = 0; i < 0x1000; i++) { + int g = 0; + + if (i & BAM_FPAIRED) g |= CRAM_FPAIRED; + if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR; + if (i & BAM_FUNMAP) g |= CRAM_FUNMAP; + if (i & BAM_FREVERSE) g |= CRAM_FREVERSE; + if (i & BAM_FREAD1) g |= CRAM_FREAD1; + if (i & BAM_FREAD2) g |= CRAM_FREAD2; + if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY; + if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL; + if (i & BAM_FDUP) g |= CRAM_FDUP; + + fd->cram_flag_swap[i] = g; + } + } else { + /* NOP */ + for (i = 0; i < 0x1000; i++) + fd->bam_flag_swap[i] = i; + for (i = 0; i < 0x1000; i++) + fd->cram_flag_swap[i] = i; + } + + memset(fd->cram_sub_matrix, 4, 32*32); + for (i = 0; i < 32; i++) { + fd->cram_sub_matrix[i]['A'&0x1f]=0; + fd->cram_sub_matrix[i]['C'&0x1f]=1; + fd->cram_sub_matrix[i]['G'&0x1f]=2; + fd->cram_sub_matrix[i]['T'&0x1f]=3; + fd->cram_sub_matrix[i]['N'&0x1f]=4; + } + for (i = 0; i < 20; i+=4) { + int j; + for (j = 0; j < 20; j++) { + fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; + fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; + 
fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; + fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; + } + fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0; + fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1; + fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2; + fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3; + } +} + +// Default version numbers for CRAM +static int major_version = 2; +static int minor_version = 1; + +/* + * Opens a CRAM file for read (mode "rb") or write ("wb"). + * The filename may be "-" to indicate stdin or stdout. + * + * Returns file handle on success + * NULL on failure. + */ +cram_fd *cram_open(const char *filename, const char *mode) { + cram_FILE *fp; + cram_fd *fd; + char fmode[3]= { mode[0], '\0', '\0' }; + + if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) { + fmode[1] = 'b'; + } + +#ifdef SAMTOOLS + fp = hopen(filename, fmode); +#else + if (strcmp(filename, "-") == 0) { + fp = (*fmode == 'r') ? stdin : stdout; + } else { + fp = fopen(filename, fmode); + } +#endif + if (!fp) + return NULL; + + fd = cram_dopen(fp, filename, mode); + if (!fd) + hclose_abruptly(fp); + + return fd; +} + +/* Opens an existing stream for reading or writing. + * + * Returns file handle on success; + * NULL on failure. + * + * cram_FILE is either htslib's hFILE or stdio's FILE, depending on how + * cram_structs.h has been configured. 
+ */ +cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) { + int i; + char *cp; + cram_fd *fd = calloc(1, sizeof(*fd)); + if (!fd) + return NULL; + + fd->level = 5; + if (strlen(mode) > 2 && mode[2] >= '0' && mode[2] <= '9') + fd->level = mode[2] - '0'; + + fd->fp = fp; + fd->mode = *mode; + fd->first_container = 0; + + if (fd->mode == 'r') { + /* Reader */ + + if (!(fd->file_def = cram_read_file_def(fd))) + goto err; + + fd->version = fd->file_def->major_version * 100 + + fd->file_def->minor_version; + + if (!(fd->header = cram_read_SAM_hdr(fd))) + goto err; + + } else { + /* Writer */ + cram_file_def def; + + def.magic[0] = 'C'; + def.magic[1] = 'R'; + def.magic[2] = 'A'; + def.magic[3] = 'M'; + def.major_version = major_version; + def.minor_version = minor_version; + memset(def.file_id, 0, 20); + strncpy(def.file_id, filename, 20); + if (0 != cram_write_file_def(fd, &def)) + goto err; + + fd->version = def.major_version * 100 + def.minor_version; + + /* SAM header written later */ + } + + cram_init_tables(fd); + + fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename); + if (!fd->prefix) + goto err; + fd->slice_num = 0; + fd->first_base = fd->last_base = -1; + fd->record_counter = 0; + + fd->ctr = NULL; + fd->refs = refs_create(); + if (!fd->refs) + goto err; + fd->ref_id = -2; + fd->ref = NULL; + + fd->decode_md = 0; + fd->verbose = 0; + fd->seqs_per_slice = SEQS_PER_SLICE; + fd->slices_per_container = SLICE_PER_CNT; + fd->embed_ref = 0; + fd->no_ref = 0; + fd->ignore_md5 = 0; + fd->use_bz2 = 0; + fd->multi_seq = 0; + fd->unsorted = 0; + fd->shared_ref = 0; + + fd->index = NULL; + fd->own_pool = 0; + fd->pool = NULL; + fd->rqueue = NULL; + fd->job_pending = NULL; + fd->ooc = 0; + + for (i = 0; i < 7; i++) + fd->m[i] = cram_new_metrics(); + + fd->range.refid = -2; // no ref. 
+ fd->eof = 1; // See samtools issue #150 + fd->ref_fn = NULL; + + fd->bl = NULL; + + /* Initialise dummy refs from the @SQ headers */ + if (-1 == refs_from_header(fd->refs, fd, fd->header)) + goto err; + + return fd; + + err: + if (fd) + free(fd); + + return NULL; +} + +/* + * Seek within a CRAM file. + * + * Returns 0 on success + * -1 on failure + */ +int cram_seek(cram_fd *fd, off_t offset, int whence) { + char buf[65536]; + + if (hseek(fd->fp, offset, whence) >= 0) + return 0; + + if (!(whence == SEEK_CUR && offset >= 0)) + return -1; + + /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */ + while (offset > 0) { + int len = MIN(65536, offset); + if (len != hread(fd->fp, buf, len)) + return -1; + offset -= len; + } + + return 0; +} + +/* + * Flushes a CRAM file. + * Useful for when writing to stdout without wishing to close the stream. + * + * Returns 0 on success + * -1 on failure + */ +int cram_flush(cram_fd *fd) { + if (!fd) + return -1; + + if (fd->mode == 'w' && fd->ctr) { + if(fd->ctr->slice) + fd->ctr->curr_slice++; + if (-1 == cram_flush_container_mt(fd, fd->ctr)) + return -1; + } + + return 0; +} + +/* + * Closes a CRAM file. 
+ * Returns 0 on success
+ *        -1 on failure
+ *
+ * For a writer, the current container is flushed first and a 30-byte EOF
+ * block is appended before the stream is closed.
+ *
+ * NOTE(review): each "return -1" below leaves before the frees at the end
+ * of the function, so after a failure fd and its members are not released.
+ */
+int cram_close(cram_fd *fd) {
+    spare_bams *bl, *next;
+    int i;
+
+    if (!fd)
+        return -1;
+
+    /* Same flush sequence as cram_flush() */
+    if (fd->mode == 'w' && fd->ctr) {
+        if(fd->ctr->slice)
+            fd->ctr->curr_slice++;
+        if (-1 == cram_flush_container_mt(fd, fd->ctr))
+            return -1;
+    }
+
+    if (fd->pool) {
+        /* Flush the thread pool and results before destroying the locks */
+        t_pool_flush(fd->pool);
+
+        if (0 != cram_flush_result(fd))
+            return -1;
+
+        pthread_mutex_destroy(&fd->metrics_lock);
+        pthread_mutex_destroy(&fd->ref_lock);
+        pthread_mutex_destroy(&fd->bam_list_lock);
+
+        fd->ctr = NULL; // prevent double freeing
+
+        //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
+
+        t_results_queue_destroy(fd->rqueue);
+    }
+
+    if (fd->mode == 'w') {
+        /* Write EOF block (bytes 0x45 0x4f 0x46 in it spell "EOF") */
+        if (30 != hwrite(fd->fp, "\x0b\x00\x00\x00\xff\xff\xff\xff"
+                         "\xff\xe0\x45\x4f\x46\x00\x00\x00"
+                         "\x00\x01\x00\x00\x01\x00\x06\x06"
+                         "\x01\x00\x01\x00\x01\x00", 30))
+            return -1;
+
+//      if (1 != fwrite("\x00\x00\x00\x00\xff\xff\xff\xff"
+//                      "\xff\xe0\x45\x4f\x46\x00\x00\x00"
+//                      "\x00\x00\x00", 19, 1, fd->fp))
+//          return -1;
+    }
+
+    /* Release the list of spare bam arrays */
+    for (bl = fd->bl; bl; bl = next) {
+        /* This 'i' shadows the function-level 'i' for the loop below */
+        int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
+
+        next = bl->next;
+        for (i = 0; i < max_rec; i++) {
+            if (bl->bams[i])
+                bam_free(bl->bams[i]);
+        }
+        free(bl->bams);
+        free(bl);
+    }
+
+    if (paranoid_hclose(fd->fp) != 0)
+        return -1;
+
+    if (fd->file_def)
+        cram_free_file_def(fd->file_def);
+
+    if (fd->header)
+        sam_hdr_free(fd->header);
+
+    free(fd->prefix);
+
+    if (fd->ctr)
+        cram_free_container(fd->ctr);
+
+    if (fd->refs)
+        refs_free(fd->refs);
+    if (fd->ref_free)
+        free(fd->ref_free);
+
+    for (i = 0; i < 7; i++)
+        if (fd->m[i])
+            free(fd->m[i]);
+
+    if (fd->index)
+        cram_index_free(fd);
+
+    /* Only destroy a pool this handle created itself (own_pool set) */
+    if (fd->own_pool && fd->pool)
+        t_pool_destroy(fd->pool, 0);
+
+    free(fd);
+    return 0;
+}
+
+/*
+ * Returns 1 if we hit an EOF while reading.
+ *
+ * The value returned is simply fd->eof.
+ */
+int cram_eof(cram_fd *fd) {
+    return fd->eof;
+}
+
+
+/*
+ * Sets options on the cram_fd.
See CRAM_OPT_* definitions in cram_structs.h. + * Use this immediately after opening. + * + * Returns 0 on success + * -1 on failure + */ +int cram_set_option(cram_fd *fd, enum cram_option opt, ...) { + int r; + va_list args; + + va_start(args, opt); + r = cram_set_voption(fd, opt, args); + va_end(args); + + return r; +} + +/* + * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h. + * Use this immediately after opening. + * + * Returns 0 on success + * -1 on failure + */ +int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) { + refs_t *refs; + + switch (opt) { + case CRAM_OPT_DECODE_MD: + fd->decode_md = va_arg(args, int); + break; + + case CRAM_OPT_PREFIX: + if (fd->prefix) + free(fd->prefix); + if (!(fd->prefix = strdup(va_arg(args, char *)))) + return -1; + break; + + case CRAM_OPT_VERBOSITY: + fd->verbose = va_arg(args, int); + break; + + case CRAM_OPT_SEQS_PER_SLICE: + fd->seqs_per_slice = va_arg(args, int); + break; + + case CRAM_OPT_SLICES_PER_CONTAINER: + fd->slices_per_container = va_arg(args, int); + break; + + case CRAM_OPT_EMBED_REF: + fd->embed_ref = va_arg(args, int); + break; + + case CRAM_OPT_NO_REF: + fd->no_ref = va_arg(args, int); + break; + + case CRAM_OPT_IGNORE_MD5: + fd->ignore_md5 = va_arg(args, int); + break; + + case CRAM_OPT_USE_BZIP2: + fd->use_bz2 = va_arg(args, int); + break; + + case CRAM_OPT_SHARED_REF: + fd->shared_ref = 1; + refs = va_arg(args, refs_t *); + if (refs != fd->refs) { + if (fd->refs) + refs_free(fd->refs); + fd->refs = refs; + fd->refs->count++; + } + break; + + case CRAM_OPT_RANGE: + fd->range = *va_arg(args, cram_range *); + return cram_seek_to_refpos(fd, &fd->range); + + case CRAM_OPT_REFERENCE: + return cram_load_reference(fd, va_arg(args, char *)); + + case CRAM_OPT_VERSION: { + int major, minor; + char *s = va_arg(args, char *); + if (2 != sscanf(s, "%d.%d", &major, &minor)) { + fprintf(stderr, "Malformed version string %s\n", s); + return -1; + } + if (!((major == 1 && 
minor == 0) || + (major == 2 && (minor == 0 || minor == 1)) || + (major == 3 && minor == 0))) { + fprintf(stderr, "Unknown version string; " + "use 1.0, 2.0, 2.1 or 3.0\n"); + return -1; + } + break; + } + + case CRAM_OPT_MULTI_SEQ_PER_SLICE: + fd->multi_seq = va_arg(args, int); + break; + + case CRAM_OPT_NTHREADS: { + int nthreads = va_arg(args, int); + if (nthreads > 1) { + if (!(fd->pool = t_pool_init(nthreads*2, nthreads))) + return -1; + + fd->rqueue = t_results_queue_init(); + pthread_mutex_init(&fd->metrics_lock, NULL); + pthread_mutex_init(&fd->ref_lock, NULL); + pthread_mutex_init(&fd->bam_list_lock, NULL); + fd->shared_ref = 1; + fd->own_pool = 1; + } + break; + } + + case CRAM_OPT_THREAD_POOL: + fd->pool = va_arg(args, t_pool *); + if (fd->pool) { + fd->rqueue = t_results_queue_init(); + pthread_mutex_init(&fd->metrics_lock, NULL); + pthread_mutex_init(&fd->ref_lock, NULL); + pthread_mutex_init(&fd->bam_list_lock, NULL); + } + fd->shared_ref = 1; // Needed to avoid clobbering ref between threads + fd->own_pool = 0; + + //fd->qsize = 1; + //fd->decoded = calloc(fd->qsize, sizeof(cram_container *)); + //t_pool_dispatch(fd->pool, cram_decoder_thread, fd); + break; + + default: + fprintf(stderr, "Unknown CRAM option code %d\n", opt); + return -1; + } + + return 0; +} diff --git a/star-sys/STAR/source/htslib/cram/cram_io.h b/star-sys/STAR/source/htslib/cram/cram_io.h new file mode 100644 index 0000000..165dc5e --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_io.h @@ -0,0 +1,532 @@ +/* +Copyright (c) 2012-2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*! \file
+ * Include cram.h instead.
+ *
+ * This is an internal part of the CRAM system and is automatically included
+ * when you #include cram.h.
+ *
+ * Implements the low level CRAM I/O primitives.
+ * This includes basic data types such as byte, int, ITF-8,
+ * maps, bitwise I/O, etc.
+ */
+
+#ifndef _CRAM_IO_H_
+#define _CRAM_IO_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Selects the macro versions of itf8_get/itf8_put/itf8_size below
+ * instead of the function prototypes */
+#define ITF8_MACROS
+
+/* NOTE(review): both #include directives below have no target in this copy
+ * of the file -- the <...> operands appear to have been lost. Restore them
+ * from the upstream htslib cram_io.h before building. */
+#include
+#include
+
+/**@{ ----------------------------------------------------------------------
+ * ITF8 encoding and decoding.
+ *
+ * Also see the itf8_get and itf8_put macros.
+ */
+
+/*! INTERNAL: Converts two characters into an integer for use in switch{} */
+#define CRAM_KEY(a,b) (((a)<<8)|((b)))
+
+/*! Reads an integer in ITF-8 encoding from 'fd' and stores it in
+ * *val.
+ *
+ * @return
+ * Returns the number of bytes read on success;
+ *        -1 on failure
+ */
+int itf8_decode(cram_fd *fd, int32_t *val);
+
+#ifndef ITF8_MACROS
+/*! Reads an integer in ITF-8 encoding from 'cp' and stores it in
+ * *val.
+ *
+ * @return
+ * Returns the number of bytes read on success;
+ *        -1 on failure
+ */
+int itf8_get(char *cp, int32_t *val_p);
+
+/*! Stores a value to memory in ITF-8 format.
+ *
+ * @return
+ * Returns the number of bytes required to store the number.
+ * This is a maximum of 5 bytes.
+ */
+int itf8_put(char *cp, int32_t val);
+
+#else
+
+/*
+ * Macro implementations of the above
+ *
+ * itf8_get(c,v): decodes from byte pointer c into *v and evaluates to the
+ *   number of bytes consumed. The first byte selects the length:
+ *   < 0x80 -> 1 byte, < 0xc0 -> 2, < 0xe0 -> 3, < 0xf0 -> 4, otherwise 5.
+ *   Unlike the prototype above, the macro has no failure value.
+ * itf8_put(c,v): stores v at c and evaluates to the number of bytes
+ *   written: 1, 2, 3 or 4 when v fits within the mask 0x7f, 0x3fff,
+ *   0x1fffff or 0xfffffff respectively, otherwise 5.
+ * itf8_size(v): evaluates to the byte count itf8_put would use for v,
+ *   without writing anything.
+ *
+ * All three evaluate their arguments more than once, so the arguments
+ * must be free of side effects. "uc" is not defined in this header;
+ * presumably an unsigned char typedef from an included header -- confirm.
+ */
+#define itf8_get(c,v) (((uc)(c)[0]<0x80)?(*(v)=(uc)(c)[0],1):(((uc)(c)[0]<0xc0)?(*(v)=(((uc)(c)[0]<<8)|(uc)(c)[1])&0x3fff,2):(((uc)(c)[0]<0xe0)?(*(v)=(((uc)(c)[0]<<16)|((uc)(c)[1]<<8)|(uc)(c)[2])&0x1fffff,3):(((uc)(c)[0]<0xf0)?(*(v)=(((uc)(c)[0]<<24)|((uc)(c)[1]<<16)|((uc)(c)[2]<<8)|(uc)(c)[3])&0x0fffffff,4):(*(v)=(((uc)(c)[0]&0x0f)<<28)|((uc)(c)[1]<<20)|((uc)(c)[2]<<12)|((uc)(c)[3]<<4)|((uc)(c)[4]&0x0f),5)))))
+
+#define itf8_put(c,v) ((!((v)&~0x7f))?((c)[0]=(v),1):(!((v)&~0x3fff))?((c)[0]=((v)>>8)|0x80,(c)[1]=(v)&0xff,2):(!((v)&~0x1fffff))?((c)[0]=((v)>>16)|0xc0,(c)[1]=((v)>>8)&0xff,(c)[2]=(v)&0xff,3):(!((v)&~0xfffffff))?((c)[0]=((v)>>24)|0xe0,(c)[1]=((v)>>16)&0xff,(c)[2]=((v)>>8)&0xff,(c)[3]=(v)&0xff,4):((c)[0]=0xf0|(((v)>>28)&0xff),(c)[1]=((v)>>20)&0xff,(c)[2]=((v)>>12)&0xff,(c)[3]=((v)>>4)&0xff,(c)[4]=(v)&0xf,5))
+
+#define itf8_size(v) ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5)
+
+#endif
+
+/*! Pushes a value in ITF8 format onto the end of a block.
+ *
+ * This shouldn't be used for high-volume data as it is not the fastest
+ * method.
+ *
+ * @return
+ * Returns the number of bytes written
+ */
+int itf8_put_blk(cram_block *blk, int val);
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * CRAM blocks - the dynamically growable data block. We have code to
+ * create, update, (un)compress and read/write.
+ *
+ * These are derived from the deflate_interlaced.c blocks, but with the
+ * CRAM extension of content types and IDs.
+ */
+
+/*! Allocates a new cram_block structure with a specified content_type and
+ * id.
+ *
+ * @return
+ * Returns block pointer on success;
+ *         NULL on failure
+ */
+cram_block *cram_new_block(enum cram_content_type content_type,
+                           int content_id);
+
+/*! Reads a block from a cram file.
+ *
+ * @return
+ * Returns cram_block pointer on success;
+ *         NULL on failure
+ */
+cram_block *cram_read_block(cram_fd *fd);
+
+/*! Writes a CRAM block.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_write_block(cram_fd *fd, cram_block *b);
+
+/*! Frees a CRAM block, deallocating internal data too.
+ */
+void cram_free_block(cram_block *b);
+
+/*! Uncompress a memory block using Zlib.
+ *
+ * @return
+ * NOTE(review): the prototype returns char * and takes a size_t *size
+ * out-parameter, so the "0 on success; -1 on failure" wording previously
+ * given here cannot be the literal contract. Presumably the uncompressed
+ * buffer is returned with its length stored in *size, and NULL on
+ * failure -- confirm against the definition.
+ */
+char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size);
+
+/*! Uncompresses a CRAM block, if compressed.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_uncompress_block(cram_block *b);
+
+/*! Compresses a block.
+ *
+ * Compresses a block using one of two different zlib strategies. If we only
+ * want one choice set strat2 to be -1.
+ *
+ * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
+ * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
+ * significantly faster.
+ *
+ * @return
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
+                        int level,  int strat,
+                        int level2, int strat2);
+
+cram_metrics *cram_new_metrics(void);
+char *cram_block_method2str(enum cram_block_method m);
+char *cram_content_type2str(enum cram_content_type t);
+
+/* --- Accessor macros for manipulating blocks on a byte by byte basis --- */
+
+/* Block size and data pointer. */
+#define BLOCK_SIZE(b) ((b)->byte)
+#define BLOCK_DATA(b) ((b)->data)
+
+/* Returns the address one past the end of the block, i.e. the address of
+ * data[byte], the current byte position */
+#define BLOCK_END(b) (&(b)->data[(b)->byte])
+
+/* Request block to be at least 'l' bytes long.
+ * Grows 'alloc' by a factor of 1.5 (starting at 1024) until it exceeds
+ * 'l', reallocating each time. 'l' is evaluated on every iteration.
+ * NOTE(review): the realloc() result is stored without a NULL check, so on
+ * allocation failure data becomes NULL and the old buffer is lost. */
+#define BLOCK_RESIZE(b,l)                                       \
+    do {                                                        \
+        while((b)->alloc <= (l)) {                              \
+            (b)->alloc = (b)->alloc ? (b)->alloc*1.5 : 1024;    \
+            (b)->data = realloc((b)->data, (b)->alloc);         \
+        }                                                       \
+    } while(0)
+
+/* Ensure the block can hold at least another 'l' bytes */
+#define BLOCK_GROW(b,l) BLOCK_RESIZE((b), BLOCK_SIZE((b)) + (l))
+
+/* Append string 's' of length 'l' (copied with memcpy, so 's' need not be
+ * NUL terminated); 'l' is evaluated more than once */
+#define BLOCK_APPEND(b,s,l)                 \
+    do {                                    \
+        BLOCK_GROW((b),(l));                \
+        memcpy(BLOCK_END((b)), (s), (l));   \
+        BLOCK_SIZE((b)) += (l);             \
+    } while (0)
+
+/* Append as single character 'c' */
+#define BLOCK_APPEND_CHAR(b,c)              \
+    do {                                    \
+        BLOCK_GROW((b),1);                  \
+        (b)->data[(b)->byte++] = (c);       \
+    } while (0)
+
+/* Append via sprintf with 1 arg.
+ * 'buf' is caller-supplied scratch space; sprintf() is unbounded, so it
+ * must be large enough for the formatted text. */
+#define BLOCK_APPENDF_1(b,buf,fmt, a1)              \
+    do {                                            \
+        int l = sprintf((buf), (fmt), (a1));        \
+        BLOCK_APPEND((b), (buf), l);                \
+    } while (0)
+
+/* Append via sprintf with 2 args; same 'buf' requirement as above */
+#define BLOCK_APPENDF_2(b,buf,fmt, a1,a2)           \
+    do {                                            \
+        int l = sprintf((buf), (fmt), (a1), (a2));  \
+        BLOCK_APPEND((b), (buf), l);                \
+    } while (0)
+
+/* Set both comp_size and uncomp_size to the block's current byte count.
+ * This is a bare assignment expression, not wrapped in do { } while (0). */
+#define BLOCK_UPLEN(b) \
+    (b)->comp_size = (b)->uncomp_size = BLOCK_SIZE((b))
+
+/**@}*/
+/**@{ ----------------------------------------------------------------------
+ * Reference sequence handling
+ */
+
+/*! Loads a reference set from fn and stores in the cram_fd.
+ * + * @return + * Returns 0 on success; + * -1 on failure + */ +int cram_load_reference(cram_fd *fd, char *fn); + +/*! Generates a lookup table in refs based on the SQ headers in SAM_hdr. + * + * Indexes references by the order they appear in a BAM file. This may not + * necessarily be the same order they appear in the fasta reference file. + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int refs2id(refs_t *r, SAM_hdr *bfd); + +void refs_free(refs_t *r); + +/*! Returns a portion of a reference sequence from start to end inclusive. + * + * The returned pointer is owned by the cram_file fd and should not be freed + * by the caller. It is valid only until the next cram_get_ref is called + * with the same fd parameter (so is thread-safe if given multiple files). + * + * To return the entire reference sequence, specify start as 1 and end + * as 0. + * + * @return + * Returns reference on success; + * NULL on failure + */ +char *cram_get_ref(cram_fd *fd, int id, int start, int end); +void cram_ref_incr(refs_t *r, int id); +void cram_ref_decr(refs_t *r, int id); +/**@}*/ +/**@{ ---------------------------------------------------------------------- + * Containers + */ + +/*! Creates a new container, specifying the maximum number of slices + * and records permitted. + * + * @return + * Returns cram_container ptr on success; + * NULL on failure + */ +cram_container *cram_new_container(int nrec, int nslice); +void cram_free_container(cram_container *c); + +/*! Reads a container header. + * + * @return + * Returns cram_container on success; + * NULL on failure or no container left (fd->err == 0). + */ +cram_container *cram_read_container(cram_fd *fd); + +/*! Writes a container structure. + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int cram_write_container(cram_fd *fd, cram_container *h); + +/*! Flushes a container to disk. 
+ * + * Flushes a completely or partially full container to disk, writing + * container structure, header and blocks. This also calls the encoder + * functions. + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int cram_flush_container(cram_fd *fd, cram_container *c); +int cram_flush_container_mt(cram_fd *fd, cram_container *c); + + +/**@}*/ +/**@{ ---------------------------------------------------------------------- + * Compression headers; the first part of the container + */ + +/*! Creates a new blank container compression header + * + * @return + * Returns header ptr on success; + * NULL on failure + */ +cram_block_compression_hdr *cram_new_compression_header(void); + +/*! Frees a cram_block_compression_hdr */ +void cram_free_compression_header(cram_block_compression_hdr *hdr); + + +/**@}*/ +/**@{ ---------------------------------------------------------------------- + * Slices and slice headers + */ + +/*! Frees a slice header */ +void cram_free_slice_header(cram_block_slice_hdr *hdr); + +/*! Frees a slice */ +void cram_free_slice(cram_slice *s); + +/*! Creates a new empty slice in memory, for subsequent writing to + * disk. + * + * @return + * Returns cram_slice ptr on success; + * NULL on failure + */ +cram_slice *cram_new_slice(enum cram_content_type type, int nrecs); + +/*! Loads an entire slice. + * + * FIXME: In 1.0 the native unit of slices within CRAM is broken + * as slices contain references to objects in other slices. + * To work around this while keeping the slice oriented outer loop + * we read all slices and stitch them together into a fake large + * slice instead. + * + * @return + * Returns cram_slice ptr on success; + * NULL on failure + */ +cram_slice *cram_read_slice(cram_fd *fd); + + + +/**@}*/ +/**@{ ---------------------------------------------------------------------- + * CRAM file definition (header) + */ + +/*! Reads a CRAM file definition structure. 
+ * + * @return + * Returns file_def ptr on success; + * NULL on failure + */ +cram_file_def *cram_read_file_def(cram_fd *fd); + +/*! Writes a cram_file_def structure to cram_fd. + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int cram_write_file_def(cram_fd *fd, cram_file_def *def); + +/*! Frees a cram_file_def structure. */ +void cram_free_file_def(cram_file_def *def); + + +/**@}*/ +/**@{ ---------------------------------------------------------------------- + * SAM header I/O + */ + +/*! Reads the SAM header from the first CRAM data block. + * + * Also performs minimal parsing to extract read-group + * and sample information. + * + * @return + * Returns SAM hdr ptr on success; + * NULL on failure + */ +SAM_hdr *cram_read_SAM_hdr(cram_fd *fd); + +/*! Writes a CRAM SAM header. + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr); + + +/**@}*/ +/**@{ ---------------------------------------------------------------------- + * The top-level cram opening, closing and option handling + */ + +/*! Opens a CRAM file for read (mode "rb") or write ("wb"). + * + * The filename may be "-" to indicate stdin or stdout. + * + * @return + * Returns file handle on success; + * NULL on failure. + */ +cram_fd *cram_open(const char *filename, const char *mode); + +/*! Opens an existing stream for reading or writing. + * + * @return + * Returns file handle on success; + * NULL on failure. + * + * cram_FILE is either htslib's hFILE or stdio's FILE, depending on how + * cram_structs.h has been configured. + */ +cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode); + +/*! Closes a CRAM file. + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int cram_close(cram_fd *fd); + +/* + * Seek within a CRAM file. + * + * Returns 0 on success + * -1 on failure + */ +int cram_seek(cram_fd *fd, off_t offset, int whence); + +/* + * Flushes a CRAM file. 
+ * Useful for when writing to stdout without wishing to close the stream. + * + * Returns 0 on success + * -1 on failure + */ +int cram_flush(cram_fd *fd); + +/*! Checks for end of file on a cram_fd stream. + * + * @return + * Returns 0 if not at end of file + * 1 if we hit an expected EOF (end of range or EOF block) + * 2 for other EOF (end of stream without EOF block) + */ +int cram_eof(cram_fd *fd); + +/*! Sets options on the cram_fd. + * + * See CRAM_OPT_* definitions in cram_structs.h. + * Use this immediately after opening. + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int cram_set_option(cram_fd *fd, enum cram_option opt, ...); + +/*! Sets options on the cram_fd. + * + * See CRAM_OPT_* definitions in cram_structs.h. + * Use this immediately after opening. + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args); + +/*! + * Attaches a header to a cram_fd. + * + * This should be used when creating a new cram_fd for writing where + * we have an SAM_hdr already constructed (eg from a file we've read + * in). + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int cram_set_header(cram_fd *fd, SAM_hdr *hdr); + + +#ifdef __cplusplus +} +#endif + +#endif /* _CRAM_IO_H_ */ diff --git a/star-sys/STAR/source/htslib/cram/cram_samtools.c b/star-sys/STAR/source/htslib/cram/cram_samtools.c new file mode 100644 index 0000000..66f2efa --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_samtools.c @@ -0,0 +1,144 @@ +/* +Copyright (c) 2010-2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#include "cram/cram.h"
+#include "htslib/sam.h"
+
+/*---------------------------------------------------------------------------
+ * Samtools compatibility portion
+ */
+
+/*
+ * Fills out the existing bam record *bp from its individual fields.
+ *
+ * The record's data buffer is grown when needed and laid out as
+ * qname (nul terminated), cigar, 4-bit packed sequence, qualities, followed
+ * by extra_len bytes that are reserved but not written here.
+ *
+ * pos and mpos are stored minus one; the bin is derived from pos and end.
+ * seq holds len ASCII bases which are packed two per byte; any character
+ * not in the lookup table is stored as code 15. qual holds len bytes that
+ * are copied verbatim.
+ *
+ * Returns 0 on success;
+ *        -1 on failure (negative len, record too large, or out of memory).
+ *           The record keeps its previous buffer in that case.
+ */
+int bam_construct_seq(bam_seq_t **bp, size_t extra_len,
+                      const char *qname, size_t qname_len,
+                      int flag,
+                      int rname,      // Ref ID
+                      int pos,
+                      int end,        // aligned start/end coords
+                      int mapq,
+                      uint32_t ncigar, const uint32_t *cigar,
+                      int mrnm,       // Mate Ref ID
+                      int mpos,
+                      int isize,
+                      int len,
+                      const char *seq,
+                      const char *qual) {
+    static const char L[256] = {
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15, 0,15,15,
+        15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15,
+        15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15,
+        15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15,
+        15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
+    };
+    bam1_t *b = (bam1_t *)*bp;
+    uint8_t *cp;
+    int i, bam_len;
+    size_t need;
+
+    //b->l_aux = extra_len; // we fill this out later
+
+    /* A negative len would become a huge size_t in the copies below */
+    if (len < 0)
+        return -1;
+
+    /* Sum in size_t, then refuse anything the int record length cannot hold */
+    need = qname_len + 1 + (size_t)ncigar*4 + ((size_t)len+1)/2 + len
+        + extra_len;
+    if (need > INT32_MAX)
+        return -1;
+    bam_len = (int)need;
+
+    if (b->m_data < bam_len) {
+        uint32_t new_size = bam_len;
+        uint8_t *new_data;
+
+        kroundup32(new_size);
+        /*
+         * Go via temporaries: if realloc fails the record must keep its
+         * old (still valid) buffer and capacity instead of leaking it.
+         */
+        new_data = (uint8_t *)realloc(b->data, new_size);
+        if (!new_data)
+            return -1;
+        b->data = new_data;
+        b->m_data = new_size;
+    }
+    b->l_data = bam_len;
+
+    b->core.tid     = rname;
+    b->core.pos     = pos-1;
+    b->core.bin     = bam_reg2bin(pos, end);
+    b->core.qual    = mapq;
+    b->core.l_qname = qname_len+1;
+    b->core.flag    = flag;
+    b->core.n_cigar = ncigar;
+    b->core.l_qseq  = len;
+    b->core.mtid    = mrnm;
+    b->core.mpos    = mpos-1;
+    b->core.isize   = isize;
+
+    cp = b->data;
+
+    strncpy((char *)cp, qname, qname_len);
+    cp[qname_len] = 0;
+    cp += qname_len+1;
+    memcpy(cp, cigar, (size_t)ncigar*4);
+    cp += (size_t)ncigar*4;
+
+    /* Two bases per byte, first base in the high nibble */
+    for (i = 0; i+1 < len; i+=2) {
+        *cp++ = (L[(uc)seq[i]]<<4) + L[(uc)seq[i+1]];
+    }
+    if (i < len)
+        *cp++ = L[(uc)seq[i]]<<4;
+
+    memcpy(cp, qual, len);
+
+    return 0;
+}
+
+/*
+ * Builds a newly allocated bam_hdr_t from a CRAM SAM_hdr: the header text
+ * is copied and one target name/length pair is created per reference.
+ *
+ * Returns the new header on success;
+ *         NULL on memory allocation failure.
+ */
+bam_hdr_t *cram_header_to_bam(SAM_hdr *h) {
+    int i;
+    bam_hdr_t *header = bam_hdr_init();
+
+    if (!header)
+        return NULL;
+
+    header->l_text = ks_len(&h->text);
+    header->text = malloc(header->l_text+1);
+    if (!header->text)
+        goto err;
+    memcpy(header->text, ks_str(&h->text), header->l_text);
+    header->text[header->l_text] = 0;
+
+    /* calloc(0, ...) may legitimately return NULL, so only nref > 0 fails */
+    header->target_name = (char **)calloc(h->nref, sizeof(char *));
+    if (!header->target_name && h->nref > 0)
+        goto err;
+    header->target_len = (uint32_t *)calloc(h->nref, 4);
+    if (!header->target_len && h->nref > 0)
+        goto err;
+
+    /* Publish the count only once both arrays exist, so cleanup stays safe */
+    header->n_targets = h->nref;
+
+    for (i = 0; i < h->nref; i++) {
+        header->target_name[i] = strdup(h->ref[i].name);
+        if (!header->target_name[i])
+            goto err;
+        header->target_len[i] = h->ref[i].len;
+    }
+
+    return header;
+
+ err:
+    bam_hdr_destroy(header);
+    return NULL;
+}
+
+/*
+ * Parses the text of a bam header into a CRAM SAM_hdr.
+ *
+ * Returns whatever sam_hdr_parse_() returns for h->text / h->l_text.
+ */
+SAM_hdr *bam_header_to_cram(bam_hdr_t *h) {
+    return sam_hdr_parse_(h->text, h->l_text);
+}
diff --git a/star-sys/STAR/source/htslib/cram/cram_samtools.h b/star-sys/STAR/source/htslib/cram/cram_samtools.h
new file mode 100644
index 0000000..f7949d0
--- /dev/null
+++ b/star-sys/STAR/source/htslib/cram/cram_samtools.h
@@ -0,0 +1,97 @@
+/*
+Copyright (c) 2010-2013 Genome Research Ltd.
+Author: James Bonfield
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. 
Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _CRAM_SAMTOOLS_H_
+#define _CRAM_SAMTOOLS_H_
+
+/* Samtools compatible API */
+#define bam_blk_size(b)  ((b)->l_data)
+/* Setter for the field read by bam_blk_size(); must name the same member */
+#define bam_set_blk_size(b,v) ((b)->l_data = (v))
+
+#define bam_ref(b)       (b)->core.tid
+#define bam_pos(b)       (b)->core.pos
+#define bam_mate_pos(b)  (b)->core.mpos
+#define bam_mate_ref(b)  (b)->core.mtid
+#define bam_ins_size(b)  (b)->core.isize
+#define bam_seq_len(b)   (b)->core.l_qseq
+#define bam_cigar_len(b) (b)->core.n_cigar
+#define bam_flag(b)      (b)->core.flag
+#define bam_bin(b)       (b)->core.bin
+#define bam_map_qual(b)  (b)->core.qual
+#define bam_name_len(b)  (b)->core.l_qname
+#define bam_name(b)      bam_get_qname((b))
+#define bam_qual(b)      bam_get_qual((b))
+#define bam_seq(b)       bam_get_seq((b))
+#define bam_cigar(b)     bam_get_cigar((b))
+#define bam_aux(b)       bam_get_aux((b))
+
+#define bam_dup(b)       bam_copy1(bam_init1(), (b))
+
+#define bam_free(b)      bam_destroy1((b))
+
+#define bam_reg2bin(beg,end) hts_reg2bin((beg),(end),14,5)
+
+#include "htslib/sam.h"
+
+/* CIGAR operation codes, aliased onto the BAM_C* constants */
+enum cigar_op {
+    BAM_CMATCH_=BAM_CMATCH,
+    BAM_CINS_=BAM_CINS,
+    BAM_CDEL_=BAM_CDEL,
+    BAM_CREF_SKIP_=BAM_CREF_SKIP,
+    BAM_CSOFT_CLIP_=BAM_CSOFT_CLIP,
+    BAM_CHARD_CLIP_=BAM_CHARD_CLIP,
+    BAM_CPAD_=BAM_CPAD,
+    BAM_CBASE_MATCH=BAM_CEQUAL,
+    BAM_CBASE_MISMATCH=BAM_CDIFF
+};
+
+typedef bam1_t bam_seq_t;
+
+#include "cram/sam_header.h"
+
+bam_hdr_t *cram_header_to_bam(SAM_hdr *h);
+SAM_hdr *bam_header_to_cram(bam_hdr_t *h);
+
+int bam_construct_seq(bam_seq_t **bp, size_t extra_len,
+                      const char *qname, size_t qname_len,
+                      int flag,
+                      int rname,      // Ref ID
+                      int pos,
+                      int end,        // aligned start/end coords
+                      int mapq,
+                      uint32_t ncigar, const uint32_t *cigar,
+                      int mrnm,       // Mate Ref ID
+                      int mpos,
+                      int isize,
+                      int len,
+                      const char *seq,
+                      const char *qual);
+
+#endif /* _CRAM_SAMTOOLS_H_ */
diff --git a/star-sys/STAR/source/htslib/cram/cram_stats.c b/star-sys/STAR/source/htslib/cram/cram_stats.c
new file mode 100644
index 0000000..18d0605
--- /dev/null
+++ b/star-sys/STAR/source/htslib/cram/cram_stats.c
@@ -0,0 +1,357 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "io_lib_config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "cram/cram.h"
+#include "cram/os.h"
+
+/*
+ * Allocates a zero-initialised cram_stats.
+ *
+ * Returns the new struct, or NULL if out of memory.
+ */
+cram_stats *cram_stats_create(void) {
+    return calloc(1, sizeof(cram_stats));
+}
+
+/*
+ * Records one occurrence of val.
+ *
+ * Values in the range 0 to MAX_STAT_VAL-1 are counted in the freqs[] array;
+ * everything else goes into the hash table, which is created on first use.
+ *
+ * nsamp is only incremented once the value has really been recorded, so a
+ * failed hash allocation cannot leave nsamp out of step with the stored
+ * counts (cram_stats_encoding asserts that they agree).
+ */
+void cram_stats_add(cram_stats *st, int32_t val) {
+    //assert(val >= 0);
+
+    if (val < MAX_STAT_VAL && val >= 0) {
+        st->freqs[val]++;
+    } else {
+        khint_t k;
+        int r;
+
+        if (!st->h) {
+            st->h = kh_init(m_i2i);
+            if (!st->h) {
+                fprintf(stderr, "Failed to add val %d to cram_stats\n", val);
+                return;
+            }
+        }
+
+        k = kh_put(m_i2i, st->h, val, &r);
+        if (r == -1) {
+            /* No error return is available here, so report and skip */
+            fprintf(stderr, "Failed to add val %d to cram_stats\n", val);
+            return;
+        }
+        if (r == 0)
+            kh_val(st->h, k)++;
+        else
+            kh_val(st->h, k) = 1;
+    }
+
+    st->nsamp++;
+}
+
+/*
+ * Removes one occurrence of val, undoing an earlier cram_stats_add().
+ *
+ * A hash entry whose count drops to zero is deleted. If val was never
+ * stored in the hash a message is printed and nsamp is left unchanged.
+ */
+void cram_stats_del(cram_stats *st, int32_t val) {
+    //assert(val >= 0);
+
+    if (val < MAX_STAT_VAL && val >= 0) {
+        st->freqs[val]--;
+        assert(st->freqs[val] >= 0);
+    } else {
+        khint_t k;
+
+        if (!st->h || (k = kh_get(m_i2i, st->h, val)) == kh_end(st->h)) {
+            fprintf(stderr, "Failed to remove val %d from cram_stats\n", val);
+            return;
+        }
+
+        if (--kh_val(st->h, k) == 0)
+            kh_del(m_i2i, st->h, k);
+    }
+
+    st->nsamp--;
+}
+
+/*
+ * Prints every value with a non-zero count, one "value<TAB>count" line
+ * each, to stderr: first the freqs[] array, then the hash table if present.
+ */
+void cram_stats_dump(cram_stats *st) {
+    int i;
+    fprintf(stderr, "cram_stats:\n");
+    for (i = 0; i < MAX_STAT_VAL; i++) {
+        if (!st->freqs[i])
+            continue;
+        fprintf(stderr, "\t%d\t%d\n", i, st->freqs[i]);
+    }
+    if (st->h) {
+        khint_t k;
+        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
+            if (!kh_exist(st->h, k))
+                continue;
+
+            fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, k), kh_val(st->h, k));
+        }
+    }
+}
+
+#if 0
+/*
+ * Returns the number of bits needed to hold v, i.e. the 1-based position
+ * of the highest bit set; v == 0 yields 1.
+ *
+ * Compiled out: its only callers were the per-codec size estimates, which
+ * cram_stats_encoding() no longer performs.
+ */
+static int nbits(int v) {
+    static const int MultiplyDeBruijnBitPosition[32] = {
+         1, 10,  2, 11, 14, 22,  3, 30, 12, 15, 17, 19, 23, 26,  4, 31,
+         9, 13, 21, 29, 16, 18, 25,  8, 20, 28, 24,  7, 27,  6,  5, 32
+    };
+
+    v |= v >> 1; // first set every bit below the highest set bit
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+
+    // DeBruijn magic to find top bit
+    return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
+}
+#endif
+
+/*
+ * Picks the codec to use for a data series from its value statistics.
+ *
+ * Avoid complex stats for now, just do heuristic of HUFFMAN for small
+ * alphabets (fewer than 200 distinct values) and BETA for anything large.
+ *
+ * The entropy / per-codec size estimates that used to follow the heuristic
+ * were unreachable (they sat after an unconditional return and would have
+ * read already freed arrays), so they have been removed. With them gone
+ * only the number of distinct values and the total count are needed, which
+ * also removes every allocation, and hence every failure path, from here.
+ *
+ * As a side effect st->nvals is set to the number of distinct values.
+ * fd is currently unused.
+ *
+ * Returns the codec to use.
+ */
+enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) {
+    int nvals = 0, ntot = 0, i;
+
+    //cram_stats_dump(st);
+
+    /* Count number of unique symbols: small values first ... */
+    for (i = 0; i < MAX_STAT_VAL; i++) {
+        if (!st->freqs[i])
+            continue;
+        ntot += st->freqs[i];
+        nvals++;
+    }
+
+    /* ... then everything that overflowed into the hash table */
+    if (st->h) {
+        khint_t k;
+
+        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
+            if (!kh_exist(st->h, k))
+                continue;
+            ntot += kh_val(st->h, k);
+            nvals++;
+        }
+    }
+
+    st->nvals = nvals;
+    assert(ntot == st->nsamp);
+
+    return nvals < 200 ? E_HUFFMAN : E_BETA;
+}
+
+/*
+ * Frees a cram_stats struct and its hash table. NULL is accepted and
+ * ignored, as with free().
+ */
+void cram_stats_free(cram_stats *st) {
+    if (!st)
+        return;
+    if (st->h)
+        kh_destroy(m_i2i, st->h);
+    free(st);
+}
diff --git a/star-sys/STAR/source/htslib/cram/cram_stats.h b/star-sys/STAR/source/htslib/cram/cram_stats.h
new file mode 100644
index 0000000..b471e68
--- /dev/null
+++ b/star-sys/STAR/source/htslib/cram/cram_stats.h
@@ -0,0 +1,59 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _CRAM_STATS_H_ +#define _CRAM_STATS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +cram_stats *cram_stats_create(void); +void cram_stats_add(cram_stats *st, int32_t val); +void cram_stats_del(cram_stats *st, int32_t val); +void cram_stats_dump(cram_stats *st); +void cram_stats_free(cram_stats *st); + +/* + * Computes entropy from integer frequencies for various encoding methods and + * picks the best encoding. + * + * FIXME: we could reuse some of the code here for the actual encoding + * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman. + * + * Returns the best codec to use. 
+ */ +enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/star-sys/STAR/source/htslib/cram/cram_structs.h b/star-sys/STAR/source/htslib/cram/cram_structs.h new file mode 100644 index 0000000..6d3f1a1 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/cram_structs.h @@ -0,0 +1,752 @@ +/* +Copyright (c) 2012-2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef _CRAM_STRUCTS_H_ +#define _CRAM_STRUCTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Defines in-memory structs for the basic file-format objects in the + * CRAM format. + * + * The basic file format is: + * File-def SAM-hdr Container Container ... + * + * Container: + * Service-block data-block data-block ... + * + * Multiple blocks in a container are grouped together as slices, + * also sometimes referred to as landmarks in the spec. + */ + + +#include + +#include "cram/thread_pool.h" + +#ifdef SAMTOOLS +// From within samtools/HTSlib +# include "cram/string_alloc.h" +# include "htslib/khash.h" + +// Generic hash-map integer -> integer +KHASH_MAP_INIT_INT(m_i2i, int) + +// Generic hash-set integer -> (existance) +KHASH_SET_INIT_INT(s_i2i) + +// For brevity +typedef unsigned char uc; + +/* + * A union for the preservation map. Required for khash. + */ +typedef union { + int i; + char *p; +} pmap_t; + +// Generates static functions here which isn't ideal, but we have no way +// currently to declare the kh_map_t structure here without also declaring a +// duplicate in the .c files due to the nature of the KHASH macros. 
+KHASH_MAP_INIT_STR(map, pmap_t) + +struct hFILE; +typedef struct hFILE cram_FILE; + +#else +// From within io_lib +# include "cram/bam.h" // For BAM header parsing +typedef FILE cram_FILE; +#endif + +#define SEQS_PER_SLICE 10000 +#define SLICE_PER_CNT 1 + +#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" + +#define TN_external +//#define NS_external +#define TS_external +//#define BA_external + +#define MAX_STAT_VAL 1024 +//#define MAX_STAT_VAL 16 +typedef struct { + int freqs[MAX_STAT_VAL]; + khash_t(m_i2i) *h; + int nsamp; // total number of values added + int nvals; // total number of unique values added +} cram_stats; + +/* NB: matches java impl, not the spec */ +enum cram_encoding { + E_NULL = 0, + E_EXTERNAL = 1, + E_GOLOMB = 2, + E_HUFFMAN = 3, + E_BYTE_ARRAY_LEN = 4, + E_BYTE_ARRAY_STOP = 5, + E_BETA = 6, + E_SUBEXP = 7, + E_GOLOMB_RICE = 8, + E_GAMMA = 9 +}; + +enum cram_external_type { + E_INT = 1, + E_LONG = 2, + E_BYTE = 3, + E_BYTE_ARRAY = 4, + E_BYTE_ARRAY_BLOCK = 5, +}; + +/* "File Definition Structure" */ +typedef struct { + char magic[4]; + uint8_t major_version; + uint8_t minor_version; + char file_id[20]; // Filename or SHA1 checksum +} cram_file_def; + +#define CRAM_1_VERS 100 // 1.0 +#define CRAM_2_VERS 200 // 1.1, or 2.0? 
+ +struct cram_slice; + +enum cram_block_method { + BM_ERROR = -1, + RAW = 0, + GZIP = 1, + BZIP2 = 2, +}; + +enum cram_content_type { + CT_ERROR = -1, + FILE_HEADER = 0, + COMPRESSION_HEADER = 1, + MAPPED_SLICE = 2, + UNMAPPED_SLICE = 3, // CRAM_1_VERS only + EXTERNAL = 4, + CORE = 5, +}; + +/* Compression metrics */ +typedef struct { + int m1; + int m2; + int trial; + int next_trial; +} cram_metrics; + +/* Block */ +typedef struct { + enum cram_block_method method, orig_method; + enum cram_content_type content_type; + int32_t content_id; + int32_t comp_size; + int32_t uncomp_size; + int32_t idx; /* offset into data */ + unsigned char *data; + + // For bit I/O + size_t alloc; + size_t byte; + int bit; +} cram_block; + +struct cram_codec; /* defined in cram_codecs.h */ +struct cram_map; + +#define CRAM_MAP_HASH 32 +#define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1)) + +/* Compression header block */ +typedef struct { + int32_t ref_seq_id; + int32_t ref_seq_start; + int32_t ref_seq_span; + int32_t num_records; + int32_t num_landmarks; + int32_t *landmark; + + /* Flags from preservation map */ + int mapped_qs_included; + int unmapped_qs_included; + int unmapped_placed; + int qs_included; + int read_names_included; + int AP_delta; + // indexed by ref-base and subst. code + char substitution_matrix[5][4]; + + // TD Dictionary as a concatenated block + cram_block *TD_blk; // Tag Dictionary + int nTL; // number of TL entries in TD + unsigned char **TL; // array of size nTL, pointer into TD_blk. + khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices + string_alloc_t *TD_keys; // Pooled keys for TD hash. 
+ + khash_t(map) *preservation_map; + struct cram_map *rec_encoding_map[CRAM_MAP_HASH]; + struct cram_map *tag_encoding_map[CRAM_MAP_HASH]; + + struct cram_codec *BF_codec; // bam bit flags + struct cram_codec *CF_codec; // compression flags + struct cram_codec *RL_codec; // read length + struct cram_codec *AP_codec; // alignment pos + struct cram_codec *RG_codec; // read group + struct cram_codec *MF_codec; // mate flags + struct cram_codec *NS_codec; // next frag ref ID + struct cram_codec *NP_codec; // next frag pos + struct cram_codec *TS_codec; // template size + struct cram_codec *NF_codec; // next frag distance + struct cram_codec *TC_codec; // tag count CRAM_1_VERS + struct cram_codec *TN_codec; // tag name/type CRAM_1_VERS + struct cram_codec *TL_codec; // tag line CRAM_2_VERS + struct cram_codec *FN_codec; // no. features + struct cram_codec *FC_codec; // feature code + struct cram_codec *FP_codec; // feature pos + struct cram_codec *BS_codec; // base subst feature + struct cram_codec *IN_codec; // insertion feature + struct cram_codec *SC_codec; // soft-clip feature + struct cram_codec *DL_codec; // deletion len feature + struct cram_codec *BA_codec; // base feature + struct cram_codec *RS_codec; // ref skip length feature + struct cram_codec *PD_codec; // padding length feature + struct cram_codec *HC_codec; // hard clip length feature + struct cram_codec *MQ_codec; // mapping quality + struct cram_codec *RN_codec; // read names + struct cram_codec *QS_codec; // quality value (single) + struct cram_codec *Qs_codec; // quality values (string) + struct cram_codec *RI_codec; // ref ID + struct cram_codec *TM_codec; // ? + struct cram_codec *TV_codec; // ? 
+ + char *uncomp; // A single block of uncompressed data + size_t uncomp_size, uncomp_alloc; +} cram_block_compression_hdr; + +typedef struct cram_map { + int key; /* 0xe0 + 3 bytes */ + enum cram_encoding encoding; + int offset; /* Offset into a single block of memory */ + int size; /* Size */ + struct cram_codec *codec; + struct cram_map *next; // for noddy internal hash +} cram_map; + +/* Mapped or unmapped slice header block */ +typedef struct { + enum cram_content_type content_type; + int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ + int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */ + int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */ + int32_t num_records; + int32_t record_counter; + int32_t num_blocks; + int32_t num_content_ids; + int32_t *block_content_ids; + int32_t ref_base_id; /* if content_type == MAPPED_SLICE */ + unsigned char md5[16]; +} cram_block_slice_hdr; + +struct ref_entry; + +/* + * Container. + * + * Conceptually a container is split into slices, and slices into blocks. + * However on disk it's just a list of blocks and we need to query the + * block types to identify the start/end points of the slices. + * + * OR... are landmarks the start/end points of slices? + */ +typedef struct { + int32_t length; + int32_t ref_seq_id; + int32_t ref_seq_start; + int32_t ref_seq_span; + int32_t record_counter; + int64_t num_bases; + int32_t num_records; + int32_t num_blocks; + int32_t num_landmarks; + int32_t *landmark; + + /* Size of container header above */ + size_t offset; + + /* Compression header is always the first block? */ + cram_block_compression_hdr *comp_hdr; + cram_block *comp_hdr_block; + + /* For construction purposes */ + int max_slice, curr_slice; // maximum number of slices + int max_rec, curr_rec; // current and max recs per slice + int max_c_rec, curr_c_rec; // current and max recs per container + int slice_rec; // rec no. for start of this slice + int curr_ref; // current ref ID. 
-2 for no previous + int last_pos; // last record position + struct cram_slice **slices, *slice; + int pos_sorted; // boolean, 1=>position sorted data + int max_apos; // maximum position, used if pos_sorted==0 + int last_slice; // number of reads in last slice (0 for 1st) + int multi_seq; // true if packing multi seqs per cont/slice + int unsorted; // true is AP_delta is 0. + + /* Copied from fd before encoding, to allow multi-threading */ + int ref_start, first_base, last_base, ref_id, ref_end; + char *ref; + //struct ref_entry *ref; + + /* For multi-threading */ + bam_seq_t **bams; + + /* Statistics for encoding */ + cram_stats *TS_stats; + cram_stats *RG_stats; + cram_stats *FP_stats; + cram_stats *NS_stats; + cram_stats *RN_stats; + cram_stats *CF_stats; + cram_stats *TN_stats; + cram_stats *BA_stats; + cram_stats *TV_stats; + cram_stats *BS_stats; + cram_stats *FC_stats; + cram_stats *BF_stats; + cram_stats *AP_stats; + cram_stats *NF_stats; + cram_stats *MF_stats; + cram_stats *FN_stats; + cram_stats *RL_stats; + cram_stats *DL_stats; + cram_stats *TC_stats; + cram_stats *TL_stats; + cram_stats *MQ_stats; + cram_stats *TM_stats; + cram_stats *QS_stats; + cram_stats *NP_stats; + cram_stats *RI_stats; + cram_stats *RS_stats; + cram_stats *PD_stats; + cram_stats *HC_stats; + + khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map + int *refs_used; // array of frequency of ref seq IDs +} cram_container; + +/* + * A single cram record + */ +typedef struct { + struct cram_slice *s; // Filled out by cram_decode only + + int32_t ref_id; // fixed for all recs in slice? 
+    int32_t flags;        // BF
+    int32_t cram_flags;   // CF
+    int32_t len;          // RL
+    int32_t apos;         // AP
+    int32_t rg;           // RG
+    int32_t name;         // RN; idx to s->names_blk
+    int32_t name_len;
+    int32_t mate_line;    // index to another cram_record
+    int32_t mate_ref_id;
+    int32_t mate_pos;     // NP
+    int32_t tlen;         // TS
+
+    // Auxiliary data
+    int32_t ntags;        // TC
+    int32_t aux;          // idx to s->aux_blk
+    int32_t aux_size;     // total size of packed ntags in aux_blk
+#ifndef TN_external
+    int32_t TN_idx;       // TN; idx to s->TN;
+#else
+    int32_t tn;           // idx to s->tn_blk
+#endif
+    int     TL;
+
+    int32_t seq;          // idx to s->seqs_blk
+    int32_t qual;         // idx to s->qual_blk
+    int32_t cigar;        // idx to s->cigar
+    int32_t ncigar;
+    int32_t aend;         // alignment end
+    int32_t mqual;        // MQ
+
+    int32_t feature;      // idx to s->feature
+    int32_t nfeature;     // number of features
+    int32_t mate_flags;   // MF
+} cram_record;
+
+// Accessor macros as an analogue of the bam ones
+#define cram_qname(c)    (&(c)->s->name_blk->data[(c)->name])
+#define cram_seq(c)      (&(c)->s->seqs_blk->data[(c)->seq])
+#define cram_qual(c)     (&(c)->s->qual_blk->data[(c)->qual])
+#define cram_aux(c)      (&(c)->s->aux_blk->data[(c)->aux])
+#define cram_seqi(c,i)   (cram_seq((c))[(i)])
+#define cram_name_len(c) ((c)->name_len)
+#define cram_strand(c)   (((c)->flags & BAM_FREVERSE) != 0)
+#define cram_mstrand(c)  (((c)->flags & BAM_FMREVERSE) != 0)
+// Must expand its own argument throughout, not a caller variable named "cr"
+#define cram_cigar(c)    (&((c)->s->cigar)[(c)->cigar])
+
+/*
+ * A feature is a base difference, used for the sequence reference encoding.
+ * (We generate these internally when writing CRAM.)
+ */ +typedef struct { + union { + struct { + int pos; + int code; + int base; // substitution code + } X; + struct { + int pos; + int code; + int base; // actual base & qual + int qual; + } B; + struct { + int pos; + int code; + int qual; + } Q; + struct { + int pos; + int code; + int len; + int seq_idx; // soft-clip multiple bases + } S; + struct { + int pos; + int code; + int len; + int seq_idx; // insertion multiple bases + } I; + struct { + int pos; + int code; + int base; // insertion single base + } i; + struct { + int pos; + int code; + int len; + } D; + struct { + int pos; + int code; + int len; + } N; + struct { + int pos; + int code; + int len; + } P; + struct { + int pos; + int code; + int len; + } H; + }; +} cram_feature; + +/* + * A slice is really just a set of blocks, but it + * is the logical unit for decoding a number of + * sequences. + */ +typedef struct cram_slice { + cram_block_slice_hdr *hdr; + cram_block *hdr_block; + cram_block **block; + cram_block **block_by_id; + + /* State used during encoding/decoding */ + int last_apos, max_apos; + + /* Identifier used for auto-assigning read names */ + uint64_t id; + + /* Array of decoded cram records */ + cram_record *crecs; + + /* An dynamically growing buffers for data pointed + * to by crecs[] array. + */ + uint32_t *cigar; + uint32_t cigar_alloc; + uint32_t ncigar; + cram_block *name_blk; + cram_block *seqs_blk; + cram_block *qual_blk; + cram_block *aux_blk; + cram_block *base_blk; // substitutions (soft-clips for 1.0) + cram_block *soft_blk; // soft-clips + + cram_feature *features; + int nfeatures; + int afeatures; // allocated size of features + +#ifndef TN_external + // TN field (Tag Name) + uint32_t *TN; + int nTN, aTN; // used and allocated size for TN[] +#else + cram_block *tn_blk; + int tn_id; +#endif + + string_alloc_t *pair_keys; // Pooled keys for pair hash. + khash_t(m_s2i) *pair; // for identifying read-pairs in this slice. 
+ + char *ref; // slice of current reference + int ref_start; // start position of current reference; + int ref_end; // end position of current reference; + +#ifdef BA_external + int BA_len; + int ba_id; +#endif + int ref_id; +} cram_slice; + +/*----------------------------------------------------------------------------- + * Consider moving reference handling to cram_refs.[ch] + */ +// from fa.fai / samtools faidx files +typedef struct ref_entry { + char *name; + char *fn; + int64_t length; + int64_t offset; + int bases_per_line; + int line_length; + int64_t count; // for shared references so we know to dealloc seq + char *seq; +} ref_entry; + +KHASH_MAP_INIT_STR(refs, ref_entry*) + +// References structure. +typedef struct { + string_alloc_t *pool; // String pool for holding filenames and SN vals + + khash_t(refs) *h_meta; // ref_entry*, index by name + ref_entry **ref_id; // ref_entry*, index by ID + int nref; // number of ref_entry + + char *fn; // current file opened + FILE *fp; // and the FILE* to go with it. + + int count; // how many cram_fd sharing this refs struct + + pthread_mutex_t lock; // Mutex for multi-threaded updating + ref_entry *last; // Last queried sequence + int last_id; // Used in cram_ref_decr_locked to delay free +} refs_t; + +/*----------------------------------------------------------------------------- + * CRAM index + * + * Detect format by number of entries per line. + * 5 => 1.0 (refid, start, nseq, C offset, slice) + * 6 => 1.1 (refid, start, span, C offset, S offset, S size) + * + * Indices are stored in a nested containment list, which is trivial to set + * up as the indices are on sorted data so we're appending to the nclist + * in sorted order. Basically if a slice entirely fits within a previous + * slice then we append to that slices list. This is done recursively. + * + * Lists are sorted on two dimensions: ref id + slice coords. 
+ */ +typedef struct cram_index { + int nslice, nalloc; // total number of slices + struct cram_index *e; // array of size nslice + + int refid; // 1.0 1.1 + int start; // 1.0 1.1 + int end; // 1.1 + int nseq; // 1.0 - undocumented + int slice; // 1.0 landmark index, 1.1 landmark value + int len; // 1.1 - size of slice in bytes + int64_t offset; // 1.0 1.1 +} cram_index; + +typedef struct { + int refid; + int start; + int end; +} cram_range; + +/*----------------------------------------------------------------------------- + */ +/* CRAM File handle */ + +typedef struct spare_bams { + bam_seq_t **bams; + struct spare_bams *next; +} spare_bams; + +typedef struct cram_fd { + cram_FILE *fp; + int mode; // 'r' or 'w' + int version; + cram_file_def *file_def; + SAM_hdr *header; + + char *prefix; + int record_counter; + int slice_num; + int err; + + // Most recent compression header decoded + //cram_block_compression_hdr *comp_hdr; + //cram_block_slice_hdr *slice_hdr; + + // Current container being processed. 
+ cram_container *ctr; + + // positions for encoding or decoding + int first_base, last_base; + + // cached reference portion + refs_t *refs; // ref meta-data structure + char *ref, *ref_free; // current portion held in memory + int ref_id; + int ref_start; + int ref_end; + char *ref_fn; // reference fasta filename + + // compression level and metrics + int level; + cram_metrics *m[7]; + + // options + int decode_md; // Whether to export MD and NM tags + int verbose; + int seqs_per_slice; + int slices_per_container; + int embed_ref; + int no_ref; + int ignore_md5; + int use_bz2; + int shared_ref; + cram_range range; + + // lookup tables, stored here so we can be trivially multi-threaded + unsigned int bam_flag_swap[0x1000]; // cram -> bam flags + unsigned int cram_flag_swap[0x1000];// bam -> cram flags + unsigned char L1[256]; // ACGT{*} ->0123{4} + unsigned char L2[256]; // ACGTN{*}->01234{5} + char cram_sub_matrix[32][32]; // base substituion codes + + int index_sz; + cram_index *index; // array, sizeof index_sz + off_t first_container; + int eof; + int last_slice; // number of recs encoded in last slice + int multi_seq; + int unsorted; + int empty_container; // Marker for EOF block + + // thread pool + int own_pool; + t_pool *pool; + t_results_queue *rqueue; + pthread_mutex_t metrics_lock; + pthread_mutex_t ref_lock; + spare_bams *bl; + pthread_mutex_t bam_list_lock; + void *job_pending; + int ooc; // out of containers. +} cram_fd; + +enum cram_option { + CRAM_OPT_DECODE_MD, + CRAM_OPT_PREFIX, + CRAM_OPT_VERBOSITY, + CRAM_OPT_SEQS_PER_SLICE, + CRAM_OPT_SLICES_PER_CONTAINER, + CRAM_OPT_RANGE, + CRAM_OPT_VERSION, + CRAM_OPT_EMBED_REF, + CRAM_OPT_IGNORE_MD5, + CRAM_OPT_REFERENCE, + CRAM_OPT_MULTI_SEQ_PER_SLICE, + CRAM_OPT_NO_REF, + CRAM_OPT_USE_BZIP2, + CRAM_OPT_SHARED_REF, + CRAM_OPT_NTHREADS, + CRAM_OPT_THREAD_POOL, +}; + +/* BF bitfields */ +/* Corrected in 1.1. 
Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */ +#define CRAM_FPAIRED 256 +#define CRAM_FPROPER_PAIR 128 +#define CRAM_FUNMAP 64 +#define CRAM_FREVERSE 32 +#define CRAM_FREAD1 16 +#define CRAM_FREAD2 8 +#define CRAM_FSECONDARY 4 +#define CRAM_FQCFAIL 2 +#define CRAM_FDUP 1 + +#define CRAM_M_REVERSE 1 +#define CRAM_M_UNMAP 2 + + +/* CF bitfields */ +#define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0) +#define CRAM_FLAG_DETACHED (1<<1) +#define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) + +/* External IDs used by this implementation (only assumed during writing) */ +#define CRAM_EXT_IN 0 +#define CRAM_EXT_QUAL 1 +#define CRAM_EXT_NAME 2 +#define CRAM_EXT_TS_NP 3 +#define CRAM_EXT_TAG 4 +#define CRAM_EXT_TAG_S "\004" +#define CRAM_EXT_BA 5 +#define CRAM_EXT_TN 6 +#define CRAM_EXT_SC 7 +#define CRAM_EXT_REF 8 + +#ifdef __cplusplus +} +#endif + +#endif /* _CRAM_STRUCTS_H_ */ diff --git a/star-sys/STAR/source/htslib/cram/files.c b/star-sys/STAR/source/htslib/cram/files.c new file mode 100644 index 0000000..48f5c32 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/files.c @@ -0,0 +1,76 @@ +/* +Copyright (c) 1994, 1996-1997, 2000, 2003 MEDICAL RESEARCH COUNCIL +All rights reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1 Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2 Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF +MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or +promote products derived from this software without specific prior written +permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "io_lib_config.h" +#endif + +#include "cram/misc.h" + +#include +#include +/* Alliant's Concentrix is hugely deficient */ +/* Define things we require in this program */ +/* Methinks S_IFMT and S_IFDIR aren't defined in POSIX */ +#ifndef S_ISDIR +#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR) +#endif /*!S_ISDIR*/ +#ifndef S_ISREG +#define S_ISREG(m) (((m)&S_IFMT) == S_IFREG) +#endif /*!S_ISREG*/ + +int is_directory(char * fn) +{ + struct stat buf; + if ( stat(fn,&buf) ) return 0; + return S_ISDIR(buf.st_mode); +} + +int is_file(char * fn) +{ + struct stat buf; + if ( stat(fn,&buf) ) return 0; + return S_ISREG(buf.st_mode); +} + +int file_exists(char * fn) +{ + struct stat buf; + return ( stat(fn,&buf) == 0); +} + +int file_size(char * fn) +{ + struct stat buf; + if ( stat(fn,&buf) != 0) return 0; + return buf.st_size; +} + diff --git a/star-sys/STAR/source/htslib/cram/mFILE.c b/star-sys/STAR/source/htslib/cram/mFILE.c new file mode 100644 index 0000000..6407b76 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/mFILE.c @@ -0,0 +1,634 @@ +/* +Copyright (c) 2005-2006, 2008-2009, 2013 Genome Research Ltd. 
+Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#ifdef HAVE_CONFIG_H +#include "io_lib_config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cram/os.h" +#include "cram/mFILE.h" +#include "cram/vlen.h" + +/* + * This file contains memory-based versions of the most commonly used + * (by io_lib) stdio functions. + * + * Actual file IO takes place either on opening or closing an mFILE. 
+ * + * Coupled to this are a bunch of rather scary macros which can be obtained + * by including stdio_hack.h. It is recommended though that you use mFILE.h + * instead and replace fopen with mfopen (etc). This is more or less + * mandatory if you wish to use both FILE and mFILE structs in a single file. + */ + +static mFILE *m_channel[3]; /* stdin, stdout and stderr fakes */ + +/* + * Reads the entirety of fp into memory. If 'fn' exists it is the filename + * associated with fp. This will be used for more optimal reading (via a + * stat to identify the size and a single read). Otherwise we use successive + * reads until EOF. + * + * Returns a malloced buffer on success of length *size + * NULL on failure + */ +static char *mfload(FILE *fp, const char *fn, size_t *size, int binary) { + struct stat sb; + char *data = NULL; + size_t allocated = 0, used = 0; + int bufsize = 8192; + +#ifdef _WIN32 + if (binary) + _setmode(_fileno(fp), _O_BINARY); + else + _setmode(_fileno(fp), _O_TEXT); +#endif + + if (fn && -1 != stat(fn, &sb)) { + data = malloc(allocated = sb.st_size); + bufsize = sb.st_size; + } else { + fn = NULL; + } + + do { + size_t len; + if (used + bufsize > allocated) { + allocated += bufsize; + data = realloc(data, allocated); + } + len = fread(data + used, 1, allocated - used, fp); + if (len > 0) + used += len; + } while (!feof(fp) && (fn == NULL || used < sb.st_size)); + + *size = used; + + return data; +} + +/* + * Creates and returns m_channel[0]. + * We initialise this on the first attempted read, which then slurps in + * all of stdin until EOF is met. 
+ */ +mFILE *mstdin(void) { + if (m_channel[0]) + return m_channel[0]; + + m_channel[0] = mfcreate(NULL, 0); + if (NULL == m_channel[0]) return NULL; + m_channel[0]->fp = stdin; + return m_channel[0]; +} + +static void init_mstdin(void) { + static int done_stdin = 0; + if (done_stdin) + return; + + m_channel[0]->data = mfload(stdin, NULL, &m_channel[0]->size, 1); + m_channel[0]->mode = MF_READ; + done_stdin = 1; +} + +/* + * Creates and returns m_channel[1]. This is the fake for stdout. It starts as + * an empty buffer which is physically written out only when mfflush or + * mfclose are called. + */ +mFILE *mstdout(void) { + if (m_channel[1]) + return m_channel[1]; + + m_channel[1] = mfcreate(NULL, 0); + if (NULL == m_channel[1]) return NULL; + m_channel[1]->fp = stdout; + m_channel[1]->mode = MF_WRITE; + return m_channel[1]; +} + +/* + * Stderr as an mFILE. + * The code handles stderr by returning m_channel[2], but also checking + * for stderr in fprintf (the common usage of it) to auto-flush. + */ +mFILE *mstderr(void) { + if (m_channel[2]) + return m_channel[2]; + + m_channel[2] = mfcreate(NULL, 0); + if (NULL == m_channel[2]) return NULL; + m_channel[2]->fp = stderr; + m_channel[2]->mode = MF_WRITE; + return m_channel[2]; +} + + +/* + * For creating existing mFILE pointers directly from memory buffers. + */ +mFILE *mfcreate(char *data, int size) { + mFILE *mf = (mFILE *)malloc(sizeof(*mf)); + if (NULL == mf) return NULL; + mf->fp = NULL; + mf->data = data; + mf->alloced = size; + mf->size = size; + mf->eof = 0; + mf->offset = 0; + mf->flush_pos = 0; + mf->mode = MF_READ | MF_WRITE; + return mf; +} + +/* + * Recreate an existing mFILE to house new data/size. + * It also rewinds the file. 
+ */ +void mfrecreate(mFILE *mf, char *data, int size) { + if (mf->data) + free(mf->data); + mf->data = data; + mf->size = size; + mf->alloced = size; + mf->eof = 0; + mf->offset = 0; + mf->flush_pos = 0; +} + + +/* + * Creates a new mFILE to contain the contents of the FILE pointer. + * This mFILE is purely for in-memory operations and has no links to the + * original FILE* it came from. It also doesn't close the FILE pointer. + * Consider using mfreopen() is you need different behaviour. + * + * Returns mFILE * on success + * NULL on failure. + */ +mFILE *mfcreate_from(const char *path, const char *mode_str, FILE *fp) { + mFILE *mf; + + /* Open using mfreopen() */ + if (NULL == (mf = mfreopen(path, mode_str, fp))) + return NULL; + + /* Disassociate from the input stream */ + mf->fp = NULL; + + return mf; +} + +/* + * Converts a FILE * to an mFILE *. + * Use this for wrapper functions to turn external prototypes requring + * FILE * as an argument into internal code using mFILE *. + */ +mFILE *mfreopen(const char *path, const char *mode_str, FILE *fp) { + mFILE *mf; + int r = 0, w = 0, a = 0, b = 0, x = 0, mode = 0; + + /* Parse mode: + * r = read file contents (if truncated => don't read) + * w = write on close + * a = position at end of buffer + * x = position at same location as the original fp, don't seek on flush + */ + if (strchr(mode_str, 'r')) + r = 1, mode |= MF_READ; + if (strchr(mode_str, 'w')) + w = 1, mode |= MF_WRITE | MF_TRUNC; + if (strchr(mode_str, 'a')) + w = a = 1, mode |= MF_WRITE | MF_APPEND; + if (strchr(mode_str, 'b')) + b = 1, mode |= MF_BINARY; + if (strchr(mode_str, 'x')) + x = 1; + if (strchr(mode_str, '+')) { + w = 1, mode |= MF_READ | MF_WRITE; + if (a) + r = 1; + } + + if (r) { + mf = mfcreate(NULL, 0); + if (NULL == mf) return NULL; + if (!(mode & MF_TRUNC)) { + mf->data = mfload(fp, path, &mf->size, b); + mf->alloced = mf->size; + if (!a) + fseek(fp, 0, SEEK_SET); + } + } else if (w) { + /* Write - initialise the data structures */ + 
mf = mfcreate(NULL, 0); + if (NULL == mf) return NULL; + } else { + fprintf(stderr, "Must specify either r, w or a for mode\n"); + return NULL; + } + mf->fp = fp; + mf->mode = mode; + + if (x) { + mf->mode |= MF_MODEX; + } + + if (a) { + mf->flush_pos = mf->size; + fseek(fp, 0, SEEK_END); + } + + return mf; +} + +/* + * Opens a file. If we have read access (r or a+) then it loads the entire + * file into memory. If We have write access then the pathname is stored. + * We do not actually write until an mfclose, which then checks this pathname. + */ +mFILE *mfopen(const char *path, const char *mode) { + FILE *fp; + + if (NULL == (fp = fopen(path, mode))) + return NULL; + return mfreopen(path, mode, fp); +} + +/* + * Closes an mFILE. If the filename is known (implying write access) then this + * also writes the data to disk. + * + * Stdout is handled by calling mfflush which writes to stdout if appropriate. + */ +int mfclose(mFILE *mf) { + if (!mf) + return -1; + + mfflush(mf); + + if (mf->fp) + fclose(mf->fp); + + mfdestroy(mf); + + return 0; +} + +/* + * Closes the file pointer contained within the mFILE without destroying + * the in-memory data. + */ +int mfdetach(mFILE *mf) { + if (!mf) + return -1; + + mfflush(mf); + + if (mf->fp) { + fclose(mf->fp); + mf->fp = NULL; + } + + return 0; +} + +/* + * Destroys an mFILE structure but does not flush or close it + */ +int mfdestroy(mFILE *mf) { + if (!mf) + return -1; + + if (mf->data) + free(mf->data); + free(mf); + + return 0; +} + +/* + * Steals that data out of an mFILE. The mFILE itself will be closed. + * It is up to the caller to free the stolen buffer. If size_out is + * not NULL, mf->size will be stored in it. + * This is more-or-less the opposite of mfcreate(). 
+ */ + +void *mfsteal(mFILE *mf, size_t *size_out) { + void *data; + + if (!mf) return NULL; + + data = mf->data; + + if (NULL != size_out) *size_out = mf->size; + + mfdetach(mf); + mf->data = NULL; + mfdestroy(mf); + + return data; +} + +/* + * Seek/tell functions. Nothing more than updating and reporting an + * in-memory index. NB we can seek on stdin or stdout even provided we + * haven't been flushing. + */ +int mfseek(mFILE *mf, long offset, int whence) { + switch (whence) { + case SEEK_SET: + mf->offset = offset; + break; + case SEEK_CUR: + mf->offset += offset; + break; + case SEEK_END: + mf->offset = mf->size + offset; + break; + default: + errno = EINVAL; + return -1; + } + + mf->eof = 0; + return 0; +} + +long mftell(mFILE *mf) { + return mf->offset; +} + +void mrewind(mFILE *mf) { + mf->offset = 0; + mf->eof = 0; +} + +/* + * mftruncate is not directly a translation of ftruncate as the latter + * takes a file descriptor instead of a FILE *. It performs the analogous + * role though. + * + * If offset is -1 then the file is truncated to be the current file + * offset. + */ +void mftruncate(mFILE *mf, long offset) { + mf->size = offset != -1 ? offset : mf->offset; + if (mf->offset > mf->size) + mf->offset = mf->size; +} + +int mfeof(mFILE *mf) { + return mf->eof; +} + +/* + * mFILE read/write functions. Basically these turn fread/fwrite syntax + * into memcpy statements, with appropriate memory handling for writing. + */ +size_t mfread(void *ptr, size_t size, size_t nmemb, mFILE *mf) { + size_t len; + char *cptr = (char *)ptr; + + if (mf == m_channel[0]) init_mstdin(); + + if (mf->size <= mf->offset) + return 0; + + len = size * nmemb <= mf->size - mf->offset + ? 
size * nmemb + : mf->size - mf->offset; + if (!size) + return 0; + + memcpy(cptr, &mf->data[mf->offset], len); + mf->offset += len; + + if (len != size * nmemb) { + mf->eof = 1; + } + + return len / size; +} + +size_t mfwrite(void *ptr, size_t size, size_t nmemb, mFILE *mf) { + if (!(mf->mode & MF_WRITE)) + return 0; + + /* Append mode => forced all writes to end of file */ + if (mf->mode & MF_APPEND) + mf->offset = mf->size; + + /* Make sure we have enough room */ + while (size * nmemb + mf->offset > mf->alloced) { + size_t new_alloced = mf->alloced ? mf->alloced * 2 : 1024; + void * new_data = realloc(mf->data, new_alloced); + if (NULL == new_data) return 0; + mf->alloced = new_alloced; + mf->data = new_data; + } + + /* Record where we need to reflush from */ + if (mf->offset < mf->flush_pos) + mf->flush_pos = mf->offset; + + /* Copy the data over */ + memcpy(&mf->data[mf->offset], ptr, size * nmemb); + mf->offset += size * nmemb; + if (mf->size < mf->offset) + mf->size = mf->offset; + + return nmemb; +} + +int mfgetc(mFILE *mf) { + if (mf == m_channel[0]) init_mstdin(); + if (mf->offset < mf->size) { + return (unsigned char)mf->data[mf->offset++]; + } + + mf->eof = 1; + return -1; +} + +int mungetc(int c, mFILE *mf) { + if (mf->offset > 0) { + mf->data[--mf->offset] = c; + return c; + } + + mf->eof = 1; + return -1; +} + +char *mfgets(char *s, int size, mFILE *mf) { + int i; + + if (mf == m_channel[0]) init_mstdin(); + *s = 0; + for (i = 0; i < size-1;) { + if (mf->offset < mf->size) { + s[i] = mf->data[mf->offset++]; + if (s[i++] == '\n') + break; + } else { + mf->eof = 1; + break; + } + } + + s[i] = 0; + return i ? s : NULL; +} + +/* + * Flushes an mFILE. If this is a real open of a file in write mode then + * mFILE->fp will be set. We then write out any new data in mFILE since the + * last flush. 
We cannot tell what may have been modified as we don't keep + * track of that, so we typically rewrite out the entire file contents between + * the last flush_pos and the end of file. + * + * For stderr/stdout we also reset the offsets so we cannot modify things + * we've already output. + */ +int mfflush(mFILE *mf) { + if (!mf->fp) + return 0; + + /* FIXME: only do this when opened in write mode */ + if (mf == m_channel[1] || mf == m_channel[2]) { + if (mf->flush_pos < mf->size) { + size_t bytes = mf->size - mf->flush_pos; + if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes) + return -1; + if (0 != fflush(mf->fp)) + return -1; + } + + /* Stdout & stderr are non-seekable streams so throw away the data */ + mf->offset = mf->size = mf->flush_pos = 0; + } + + /* only flush when opened in write mode */ + if (mf->mode & MF_WRITE) { + if (mf->flush_pos < mf->size) { + size_t bytes = mf->size - mf->flush_pos; + if (!(mf->mode & MF_MODEX)) { + fseek(mf->fp, mf->flush_pos, SEEK_SET); + } + if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes) + return -1; + if (0 != fflush(mf->fp)) + return -1; + } + if (ftell(mf->fp) != -1 && + ftruncate(fileno(mf->fp), ftell(mf->fp)) == -1) + return -1; + mf->flush_pos = mf->size; + } + + return 0; +} + +/* + * A wrapper around vsprintf() to write to an mFILE. This also uses vflen() to + * estimate how many additional bytes of storage will be required for the + * vsprintf to work. + */ +int mfprintf(mFILE *mf, char *fmt, ...) { + int ret; + size_t est_length; + va_list args; + + va_start(args, fmt); + est_length = vflen(fmt, args); + va_end(args); + while (est_length + mf->offset > mf->alloced) { + size_t new_alloced = mf->alloced ? 
mf->alloced * 2 : 1024; + void * new_data = realloc(mf->data, new_alloced); + if (NULL == new_data) return -1; + mf->alloced = new_alloced; + mf->data = new_data; + } + + va_start(args, fmt); + ret = vsprintf(&mf->data[mf->offset], fmt, args); + va_end(args); + + if (ret > 0) { + mf->offset += ret; + if (mf->size < mf->offset) + mf->size = mf->offset; + } + + if (mf->fp == stderr) { + /* Auto-flush for stderr */ + if (0 != mfflush(mf)) return -1; + } + + return ret; +} + +/* + * Converts an mFILE from binary to ascii mode by replacing all + * cr-nl with nl. + * + * Primarily used on windows when we've uncompressed a binary file which + * happens to be a text file (eg Experiment File). Previously we would have + * seeked back to the start and used _setmode(fileno(fp), _O_TEXT). + * + * Side effect: resets offset and flush_pos back to the start. + */ +void mfascii(mFILE *mf) { + size_t p1, p2; + + for (p1 = p2 = 1; p1 < mf->size; p1++, p2++) { + if (mf->data[p1] == '\n' && mf->data[p1-1] == '\r') { + p2--; /* delete the \r */ + } + mf->data[p2] = mf->data[p1]; + } + mf->size = p2; + + mf->offset = mf->flush_pos = 0; +} diff --git a/star-sys/STAR/source/htslib/cram/mFILE.h b/star-sys/STAR/source/htslib/cram/mFILE.h new file mode 100644 index 0000000..5ded9a5 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/mFILE.h @@ -0,0 +1,88 @@ +/* +Copyright (c) 2005-2006, 2008-2009 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. 
Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _MFILE_H_ +#define _MFILE_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + FILE *fp; + char *data; + size_t alloced; + int eof; + int mode; /* open mode in MF_?? 
define bit pattern */ + size_t size; + size_t offset; + size_t flush_pos; +} mFILE; + +#define MF_READ 1 +#define MF_WRITE 2 +#define MF_APPEND 4 +#define MF_BINARY 8 +#define MF_TRUNC 16 +#define MF_MODEX 32 + +mFILE *mfreopen(const char *path, const char *mode, FILE *fp); +mFILE *mfopen(const char *path, const char *mode); +int mfdetach(mFILE *mf); +int mfclose(mFILE *mf); +int mfdestroy(mFILE *mf); +int mfseek(mFILE *mf, long offset, int whence); +long mftell(mFILE *mf); +void mrewind(mFILE *mf); +void mftruncate(mFILE *mf, long offset); +int mfeof(mFILE *mf); +size_t mfread(void *ptr, size_t size, size_t nmemb, mFILE *mf); +size_t mfwrite(void *ptr, size_t size, size_t nmemb, mFILE *mf); +int mfgetc(mFILE *mf); +int mungetc(int c, mFILE *mf); +mFILE *mfcreate(char *data, int size); +mFILE *mfcreate_from(const char *path, const char *mode_str, FILE *fp); +void mfrecreate(mFILE *mf, char *data, int size); +void *mfsteal(mFILE *mf, size_t *size_out); +char *mfgets(char *s, int size, mFILE *mf); +int mfflush(mFILE *mf); +int mfprintf(mFILE *mf, char *fmt, ...); +mFILE *mstdin(void); +mFILE *mstdout(void); +mFILE *mstderr(void); +void mfascii(mFILE *mf); + +#ifdef __cplusplus +} +#endif + +#endif /* _MFILE_H_ */ diff --git a/star-sys/STAR/source/htslib/cram/md5.c b/star-sys/STAR/source/htslib/cram/md5.c new file mode 100644 index 0000000..68123c0 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/md5.c @@ -0,0 +1,295 @@ +/* + * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. + * MD5 Message-Digest Algorithm (RFC 1321). + * + * Homepage: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 + * + * Author: + * Alexander Peslyak, better known as Solar Designer + * + * This software was written by Alexander Peslyak in 2001. No copyright is + * claimed, and the software is hereby placed in the public domain. 
+ * In case this attempt to disclaim copyright and place the software in the + * public domain is deemed null and void, then the software is + * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the + * general public under the following terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * There's ABSOLUTELY NO WARRANTY, express or implied. + * + * (This is a heavily cut-down "BSD license".) + * + * This differs from Colin Plumb's older public domain implementation in that + * no exactly 32-bit integer data type is required (any 32-bit or wider + * unsigned integer data type will do), there's no compile-time endianness + * configuration, and the function prototypes match OpenSSL's. No code from + * Colin Plumb's implementation has been reused; this comment merely compares + * the properties of the two independent implementations. + * + * The primary goals of this implementation are portability and ease of use. + * It is meant to be fast, but not as fast as possible. Some known + * optimizations are not included to reduce source code size and avoid + * compile-time configuration. + */ + +#ifndef HAVE_OPENSSL + +#include + +#include "md5.h" + +/* + * The basic MD5 functions. + * + * F and G are optimized compared to their RFC 1321 definitions for + * architectures that lack an AND-NOT instruction, just like in Colin Plumb's + * implementation. + */ +#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | ~(z))) + +/* + * The MD5 transformation for all four rounds. + */ +#define STEP(f, a, b, c, d, x, t, s) \ + (a) += f((b), (c), (d)) + (x) + (t); \ + (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \ + (a) += (b); + +/* + * SET reads 4 input bytes in little-endian byte order and stores them + * in a properly aligned word in host byte order. 
+ * + * The check for little-endian architectures that tolerate unaligned + * memory accesses is just an optimization. Nothing will break if it + * doesn't work. + */ +#if defined(__i386__) || defined(__x86_64__) || defined(__vax__) +#define SET(n) \ + (*(MD5_u32plus *)&ptr[(n) * 4]) +#define GET(n) \ + SET(n) +#else +#define SET(n) \ + (ctx->block[(n)] = \ + (MD5_u32plus)ptr[(n) * 4] | \ + ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \ + ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \ + ((MD5_u32plus)ptr[(n) * 4 + 3] << 24)) +#define GET(n) \ + (ctx->block[(n)]) +#endif + +/* + * This processes one or more 64-byte data blocks, but does NOT update + * the bit counters. There are no alignment requirements. + */ +static void *body(MD5_CTX *ctx, void *data, unsigned long size) +{ + unsigned char *ptr; + MD5_u32plus a, b, c, d; + MD5_u32plus saved_a, saved_b, saved_c, saved_d; + + ptr = data; + + a = ctx->a; + b = ctx->b; + c = ctx->c; + d = ctx->d; + + do { + saved_a = a; + saved_b = b; + saved_c = c; + saved_d = d; + +/* Round 1 */ + STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7) + STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12) + STEP(F, c, d, a, b, SET(2), 0x242070db, 17) + STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22) + STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7) + STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12) + STEP(F, c, d, a, b, SET(6), 0xa8304613, 17) + STEP(F, b, c, d, a, SET(7), 0xfd469501, 22) + STEP(F, a, b, c, d, SET(8), 0x698098d8, 7) + STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12) + STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17) + STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22) + STEP(F, a, b, c, d, SET(12), 0x6b901122, 7) + STEP(F, d, a, b, c, SET(13), 0xfd987193, 12) + STEP(F, c, d, a, b, SET(14), 0xa679438e, 17) + STEP(F, b, c, d, a, SET(15), 0x49b40821, 22) + +/* Round 2 */ + STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5) + STEP(G, d, a, b, c, GET(6), 0xc040b340, 9) + STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14) + STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20) + STEP(G, a, b, c, d, 
GET(5), 0xd62f105d, 5) + STEP(G, d, a, b, c, GET(10), 0x02441453, 9) + STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14) + STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20) + STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5) + STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9) + STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14) + STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20) + STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5) + STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9) + STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14) + STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20) + +/* Round 3 */ + STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4) + STEP(H, d, a, b, c, GET(8), 0x8771f681, 11) + STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16) + STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23) + STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4) + STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11) + STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16) + STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23) + STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4) + STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11) + STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16) + STEP(H, b, c, d, a, GET(6), 0x04881d05, 23) + STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4) + STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11) + STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16) + STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23) + +/* Round 4 */ + STEP(I, a, b, c, d, GET(0), 0xf4292244, 6) + STEP(I, d, a, b, c, GET(7), 0x432aff97, 10) + STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15) + STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21) + STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6) + STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10) + STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15) + STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21) + STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6) + STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10) + STEP(I, c, d, a, b, GET(6), 0xa3014314, 15) + STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21) + STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6) + STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10) + STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15) + 
STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21) + + a += saved_a; + b += saved_b; + c += saved_c; + d += saved_d; + + ptr += 64; + } while (size -= 64); + + ctx->a = a; + ctx->b = b; + ctx->c = c; + ctx->d = d; + + return ptr; +} + +void MD5_Init(MD5_CTX *ctx) +{ + ctx->a = 0x67452301; + ctx->b = 0xefcdab89; + ctx->c = 0x98badcfe; + ctx->d = 0x10325476; + + ctx->lo = 0; + ctx->hi = 0; +} + +void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size) +{ + MD5_u32plus saved_lo; + unsigned long used, free; + + saved_lo = ctx->lo; + if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo) + ctx->hi++; + ctx->hi += size >> 29; + + used = saved_lo & 0x3f; + + if (used) { + free = 64 - used; + + if (size < free) { + memcpy(&ctx->buffer[used], data, size); + return; + } + + memcpy(&ctx->buffer[used], data, free); + data = (unsigned char *)data + free; + size -= free; + body(ctx, ctx->buffer, 64); + } + + if (size >= 64) { + data = body(ctx, data, size & ~(unsigned long)0x3f); + size &= 0x3f; + } + + memcpy(ctx->buffer, data, size); +} + +void MD5_Final(unsigned char *result, MD5_CTX *ctx) +{ + unsigned long used, free; + + used = ctx->lo & 0x3f; + + ctx->buffer[used++] = 0x80; + + free = 64 - used; + + if (free < 8) { + memset(&ctx->buffer[used], 0, free); + body(ctx, ctx->buffer, 64); + used = 0; + free = 64; + } + + memset(&ctx->buffer[used], 0, free - 8); + + ctx->lo <<= 3; + ctx->buffer[56] = ctx->lo; + ctx->buffer[57] = ctx->lo >> 8; + ctx->buffer[58] = ctx->lo >> 16; + ctx->buffer[59] = ctx->lo >> 24; + ctx->buffer[60] = ctx->hi; + ctx->buffer[61] = ctx->hi >> 8; + ctx->buffer[62] = ctx->hi >> 16; + ctx->buffer[63] = ctx->hi >> 24; + + body(ctx, ctx->buffer, 64); + + result[0] = ctx->a; + result[1] = ctx->a >> 8; + result[2] = ctx->a >> 16; + result[3] = ctx->a >> 24; + result[4] = ctx->b; + result[5] = ctx->b >> 8; + result[6] = ctx->b >> 16; + result[7] = ctx->b >> 24; + result[8] = ctx->c; + result[9] = ctx->c >> 8; + result[10] = ctx->c >> 16; + result[11] = 
ctx->c >> 24; + result[12] = ctx->d; + result[13] = ctx->d >> 8; + result[14] = ctx->d >> 16; + result[15] = ctx->d >> 24; + + memset(ctx, 0, sizeof(*ctx)); +} + +#endif diff --git a/star-sys/STAR/source/htslib/cram/md5.h b/star-sys/STAR/source/htslib/cram/md5.h new file mode 100644 index 0000000..6b065a1 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/md5.h @@ -0,0 +1,54 @@ +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. + * MD5 Message-Digest Algorithm (RFC 1321). + * + * Homepage: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 + * + * Author: + * Alexander Peslyak, better known as Solar Designer + * + * This software was written by Alexander Peslyak in 2001. No copyright is + * claimed, and the software is hereby placed in the public domain. + * In case this attempt to disclaim copyright and place the software in the + * public domain is deemed null and void, then the software is + * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the + * general public under the following terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * There's ABSOLUTELY NO WARRANTY, express or implied. + * + * See md5.c for more information. 
+ */ + +#ifdef HAVE_OPENSSL +#include +#elif !defined(_MD5_H) +#define _MD5_H + +/* Any 32-bit or wider unsigned integer data type will do */ +typedef unsigned int MD5_u32plus; + +typedef struct { + MD5_u32plus lo, hi; + MD5_u32plus a, b, c, d; + unsigned char buffer[64]; + MD5_u32plus block[16]; +} MD5_CTX; + +extern void MD5_Init(MD5_CTX *ctx); +extern void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size); +extern void MD5_Final(unsigned char *result, MD5_CTX *ctx); + +#endif + +#ifdef __cplusplus +} +#endif + diff --git a/star-sys/STAR/source/htslib/cram/misc.h b/star-sys/STAR/source/htslib/cram/misc.h new file mode 100644 index 0000000..681b28c --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/misc.h @@ -0,0 +1,110 @@ +/* +Copyright (c) 1994-1997, 2001-2002 MEDICAL RESEARCH COUNCIL +All rights reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1 Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2 Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF +MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or +promote products derived from this software without specific prior written +permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* +Copyright (c) 2003-2013 Genome Research Ltd. + +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _misc_h +#define _misc_h + +#include "cram/os.h" + +#include +#include /* varargs needed for v*printf() prototypes */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This informs gcc that crash() doesn't return, so it doesn't need to + * concern itself that code paths going via crash could mean some variables + * being undefined and then issuing uninitialised variable warnings. + * This particularly affected convert. + */ +#ifdef __GNUC__ +# define __NORETURN__ __attribute__ ((__noreturn__)) +#else +# define __NORETURN__ +#endif + +/* + * Used for printf style argument checking. We can request a function such + * as vTcl_SetResult does argument checking, avoiding bugs with using + * %d and passing in a 64-bit record. + */ +#ifdef __GNUC__ +# define __PRINTF_FORMAT__(a,b) __attribute__ ((format (printf, a, b))) +#else +# define __PRINTF_FORMAT__(a,b) +#endif + +extern int is_directory(char * fn); +extern int is_file(char * fn); +extern int file_size(char * fn); + +#define MIN(A,B) ( ( (A) < (B) ) ? (A) : (B) ) +#define MAX(A,B) ( ( (A) > (B) ) ? 
(A) : (B) ) + +#ifdef __cplusplus +} +#endif + +#endif /*_misc_h*/ diff --git a/star-sys/STAR/source/htslib/cram/open_trace_file.c b/star-sys/STAR/source/htslib/cram/open_trace_file.c new file mode 100644 index 0000000..d89815d --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/open_trace_file.c @@ -0,0 +1,386 @@ +/* +Author: James Bonfield + +Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL +All rights reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF +MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or +promote products derived from this software without specific prior written +permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* +Copyright (c) 2008, 2009, 2013 Genome Research Ltd. 
+Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cram/os.h" +#ifndef PATH_MAX +# define PATH_MAX 1024 +#endif +#ifdef HAVE_LIBCURL +# include +#endif + +#include "cram/open_trace_file.h" +#include "cram/misc.h" + +/* + * Tokenises the search path splitting on colons (unix) or semicolons + * (windows). + * We also explicitly add a "./" to the end of the search path + * + * Returns: A new search path with items separated by nul chars. 
Two nul + * chars in a row represent the end of the tokenised path. + * Returns NULL for a failure. + * + * The returned data has been malloced. It is up to the caller to free this + * memory. + */ +char *tokenise_search_path(char *searchpath) { + char *newsearch; + unsigned int i, j; + size_t len; +#ifdef _WIN32 + char path_sep = ';'; +#else + char path_sep = ':'; +#endif + + if (!searchpath) + searchpath=""; + + newsearch = (char *)malloc((len = strlen(searchpath))+5); + if (!newsearch) + return NULL; + + for (i = 0, j = 0; i < len; i++) { + /* "::" => ":". Used for escaping colons in http://foo */ + if (i < len-1 && searchpath[i] == ':' && searchpath[i+1] == ':') { + newsearch[j++] = ':'; + i++; + continue; + } + + if (searchpath[i] == path_sep) { + /* Skip blank path components */ + if (j && newsearch[j-1] != 0) + newsearch[j++] = 0; + } else { + newsearch[j++] = searchpath[i]; + } + } + + if (j) + newsearch[j++] = 0; + newsearch[j++] = '.'; + newsearch[j++] = '/'; + newsearch[j++] = 0; + newsearch[j++] = 0; + + return newsearch; +} + +#ifdef HAVE_LIBCURL +mFILE *find_file_url(char *file, char *url) { + char buf[8192], *cp; + mFILE *mf = NULL, *headers = NULL; + int maxlen = 8190 - strlen(file); + static CURL *handle = NULL; + static int curl_init = 0; + char errbuf[CURL_ERROR_SIZE]; + + *errbuf = 0; + + if (!curl_init) { + if (curl_global_init(CURL_GLOBAL_ALL)) + return NULL; + + if (NULL == (handle = curl_easy_init())) + goto error; + + curl_init = 1; + } + + /* Expand %s for the trace name */ + for (cp = buf; *url && cp - buf < maxlen; url++) { + if (*url == '%' && *(url+1) == 's') { + url++; + cp += strlen(strcpy(cp, file)); + } else { + *cp++ = *url; + } + } + *cp++ = 0; + + /* Setup the curl */ + if (NULL == (mf = mfcreate(NULL, 0)) || + NULL == (headers = mfcreate(NULL, 0))) + return NULL; + + if (0 != curl_easy_setopt(handle, CURLOPT_URL, buf)) + goto error; + if (0 != curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 60L)) + goto error; + if (0 != 
curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, + (curl_write_callback)mfwrite)) + goto error; + if (0 != curl_easy_setopt(handle, CURLOPT_WRITEDATA, mf)) + goto error; + if (0 != curl_easy_setopt(handle, CURLOPT_HEADERFUNCTION, + (curl_write_callback)mfwrite)) + goto error; + if (0 != curl_easy_setopt(handle, CURLOPT_WRITEHEADER, headers)) + goto error; + if (0 != curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, errbuf)) + goto error; + + /* Fetch! */ + if (0 != curl_easy_perform(handle)) + goto error; + + /* Report errors is approproate. 404 is silent as it may have just been + * a search via RAWDATA path, everything else is worth reporting. + */ + { + float version; + int response; + char nul = 0; + mfwrite(&nul, 1, 1, headers); + if (2 == sscanf(headers->data, "HTTP/%f %d", &version, &response)) { + if (response != 200) { + if (response != 404) + fprintf(stderr, "%.*s\n", + (int)headers->size, headers->data); + goto error; + } + } + } + + if (mftell(mf) == 0) + goto error; + + mfdestroy(headers); + + mrewind(mf); + return mf; + + error: + if (mf) + mfdestroy(mf); + if (headers) + mfdestroy(headers); + if (*errbuf) + fprintf(stderr, "%s\n", errbuf); + return NULL; +} +#endif + +/* + * Searches for file in the directory 'dirname'. If it finds it, it opens + * it. This also searches for compressed versions of the file in dirname + * too. 
+ * + * Returns mFILE pointer if found + * NULL if not + */ +static mFILE *find_file_dir(char *file, char *dirname) { + char path[PATH_MAX+1]; + size_t len = strlen(dirname); + char *cp; + + if (dirname[len-1] == '/') + len--; + + /* Special case for "./" or absolute filenames */ + if (*file == '/' || (len==1 && *dirname == '.')) { + sprintf(path, "%s", file); + } else { + /* Handle %[0-9]*s expansions, if required */ + char *path_end = path; + *path = 0; + while ((cp = strchr(dirname, '%'))) { + char *endp; + long l = strtol(cp+1, &endp, 10); + if (*endp != 's') { + strncpy(path_end, dirname, (endp+1)-dirname); + path_end += (endp+1)-dirname; + dirname = endp+1; + continue; + } + + strncpy(path_end, dirname, cp-dirname); + path_end += cp-dirname; + if (l) { + strncpy(path_end, file, l); + path_end += MIN(strlen(file), l); + file += MIN(strlen(file), l); + } else { + strcpy(path_end, file); + path_end += strlen(file); + file += strlen(file); + } + len -= (endp+1) - dirname; + dirname = endp+1; + } + strncpy(path_end, dirname, len); + path_end += MIN(strlen(dirname), len); + *path_end = 0; + if (*file) { + *path_end++ = '/'; + strcpy(path_end, file); + } + + //fprintf(stderr, "*PATH=\"%s\"\n", path); + } + + if (is_file(path)) { + return mfopen(path, "rb"); + } + + return NULL; +} + +/* + * ------------------------------------------------------------------------ + * Public functions below. + */ + +/* + * Opens a trace file named 'file'. This is initially looked for as a + * pathname relative to a file named "relative_to". This may (for + * example) be the name of an experiment file referencing the trace + * file. In this case by passing relative_to as the experiment file + * filename the trace file will be picked up in the same directory as + * the experiment file. Relative_to may be supplied as NULL. + * + * 'file' is looked for at relative_to, then the current directory, and then + * all of the locations listed in 'path' (which is a colon separated list). 
+ * If 'path' is NULL it uses the RAWDATA environment variable instead. + * + * Returns a mFILE pointer when found. + * NULL otherwise. + */ +mFILE *open_path_mfile(char *file, char *path, char *relative_to) { + char *newsearch; + char *ele; + mFILE *fp; + + /* Use path first */ + if (!path) + path = getenv("RAWDATA"); + if (NULL == (newsearch = tokenise_search_path(path))) + return NULL; + + /* + * Step through the search path testing out each component. + * We now look through each path element treating some prefixes as + * special, otherwise we treat the element as a directory. + */ + for (ele = newsearch; *ele; ele += strlen(ele)+1) { + int i; + char *suffix[6] = {"", ".gz", ".bz2", ".sz", ".Z", ".bz2"}; + for (i = 0; i < 6; i++) { + char file2[1024]; + char *ele2; + int valid = 1; + + /* + * '|' prefixing a path component indicates that we do not + * wish to perform the compression extension searching in that + * location. + */ + if (*ele == '|') { + ele2 = ele+1; + valid = (i == 0); + } else { + ele2 = ele; + } + + sprintf(file2, "%s%s", file, suffix[i]); + +#if defined(HAVE_LIBCURL) + if (0 == strncmp(ele2, "URL=", 4)) { + if (valid && (fp = find_file_url(file2, ele2+4))) { + free(newsearch); + return fp; + } + } else +#endif + if (valid && (fp = find_file_dir(file2, ele2))) { + free(newsearch); + return fp; + } + } + } + + free(newsearch); + + /* Look in the same location as the incoming 'relative_to' filename */ + if (relative_to) { + char *cp; + char relative_path[PATH_MAX+1]; + strcpy(relative_path, relative_to); + if ((cp = strrchr(relative_path, '/'))) + *cp = 0; + if ((fp = find_file_dir(file, relative_path))) + return fp; + } + + return NULL; +} diff --git a/star-sys/STAR/source/htslib/cram/open_trace_file.h b/star-sys/STAR/source/htslib/cram/open_trace_file.h new file mode 100644 index 0000000..ee52cb0 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/open_trace_file.h @@ -0,0 +1,115 @@ +/* +Author: James Bonfield + +Copyright (c) 2000-2001 
MEDICAL RESEARCH COUNCIL +All rights reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + . Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + . Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + . Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF +MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or +promote products derived from this software without specific prior written +permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* +Copyright (c) 2008, 2009, 2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _OPEN_TRACE_FILE_H_ +#define _OPEN_TRACE_FILE_H_ + +#include "cram/mFILE.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Tokenises the search path splitting on colons (unix) or semicolons + * (windows). + * We also explicitly add a "./" to the end of the search path + * + * Returns: A new search path with items separated by nul chars. Two nul + * chars in a row represent the end of the tokenised path. + * Returns NULL for a failure. + * + * The returned data has been malloced. It is up to the caller to free this + * memory. + */ +char *tokenise_search_path(char *searchpath); + +/* + * Opens a trace file named 'file'. This is initially looked for as a + * pathname relative to a file named "relative_to". 
This may (for + * example) be the name of an experiment file referencing the trace + * file. In this case by passing relative_to as the experiment file + * filename the trace file will be picked up in the same directory as + * the experiment file. Relative_to may be supplied as NULL. + * + * 'file' is looked for at relative_to, then the current directory, and then + * all of the locations listed in 'path' (which is a colon separated list). + * If 'path' is NULL it uses the RAWDATA environment variable instead. + * + * Returns a mFILE pointer when found. + * NULL otherwise. + */ +mFILE *open_path_mfile(char *file, char *path, char *relative_to); + +/* + * Returns a mFILE containing the entire contents of the url; + * NULL on failure. + */ +mFILE *find_file_url(char *file, char *url); + + +#ifdef __cplusplus +} +#endif + +#endif /* _OPEN_TRACE_FILE_H_ */ diff --git a/star-sys/STAR/source/htslib/cram/os.h b/star-sys/STAR/source/htslib/cram/os.h new file mode 100644 index 0000000..b2affe0 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/os.h @@ -0,0 +1,306 @@ +/* +Copyright (c) 1993, 1995-2002 MEDICAL RESEARCH COUNCIL +All rights reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1 Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2 Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF +MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or +promote products derived from this software without specific prior written +permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* +Copyright (c) 2004, 2006, 2009-2011, 2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * File: os.h + * + * Author: + * MRC Laboratory of Molecular Biology + * Hills Road + * Cambridge CB2 2QH + * United Kingdom + * + * Description: operating system specific type definitions + * + */ + +#ifndef _OS_H_ +#define _OS_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*----------------------------------------------------------------------------- + * Detection of endianness. The main part of this is done in autoconf, but + * for the case of MacOS FAT binaries we fall back on auto-sensing based on + * processor type too. + */ + +/* Set by autoconf */ +#define SP_LITTLE_ENDIAN + +/* Mac FAT binaries or unknown. 
Auto detect based on CPU type */ +#if !defined(SP_BIG_ENDIAN) && !defined(SP_LITTLE_ENDIAN) + +/* + * x86 equivalents + */ +#if defined(__i386__) || defined(__i386) || defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(__i686__) || defined(__i686) +# if defined(SP_BIG_ENDIAN) +# undef SP_BIG_ENDIAN +# endif +# define SP_LITTLE_ENDIAN +#endif + +/* + * DEC Alpha + */ +#if defined(__alpha__) || defined(__alpha) +# if defined(SP_LITTLE_ENDIAN) +# undef SP_LITTLE_ENDIAN +# endif +# define SP_BIG_ENDIAN +#endif + +/* + * SUN Sparc + */ +#if defined(__sparc__) || defined(__sparc) +# if defined(SP_LITTLE_ENDIAN) +# undef SP_LITTLE_ENDIAN +# endif +# define SP_BIG_ENDIAN +#endif + +/* + * PowerPC + */ +#if defined(__ppc__) || defined(__ppc) +# if defined(SP_LITTLE_ENDIAN) +# undef SP_LITTLE_ENDIAN +# endif +# define SP_BIG_ENDIAN +#endif + +/* Some catch-alls */ +#if defined(__LITTLE_ENDIAN__) || defined(__LITTLEENDIAN__) +# define SP_LITTLE_ENDIAN +#endif + +#if defined(__BIG_ENDIAN__) || defined(__BIGENDIAN__) +# define SP_BIG_ENDIAN +#endif + +#if defined(SP_BIG_ENDIAN) && defined(SP_LITTLE_ENDIAN) +# error Both BIG and LITTLE endian defined. Fix os.h and/or Makefile +#endif + +#if !defined(SP_BIG_ENDIAN) && !defined(SP_LITTLE_ENDIAN) +# error Neither BIG nor LITTLE endian defined. Fix os.h and/or Makefile +#endif + +#endif + +/*----------------------------------------------------------------------------- + * Allow for unaligned memory access. This is used in BAM code as the packed + * structure has 4-byte cigar ints after the variable length name. + * + * Consider using AX_CHECK_ALIGNED_ACCESS_REQUIRED in autoconf. 
+ */ +#if defined(__i386__) || defined(__i386) || defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(__i686__) || defined(__i686) +# define ALLOW_UAC +#endif + +/*----------------------------------------------------------------------------- + * Byte swapping macros + */ + +/* + * Our new swap runs at the same speed on Ultrix, but substantially faster + * (300% for swap_int4, ~50% for swap_int2) on an Alpha (due to the lack of + * decent 'char' support). + * + * They also have the ability to swap in situ (src == dst). Newer code now + * relies on this so don't change back! + */ +#define iswap_int8(x) \ + (((x & 0x00000000000000ffLL) << 56) + \ + ((x & 0x000000000000ff00LL) << 40) + \ + ((x & 0x0000000000ff0000LL) << 24) + \ + ((x & 0x00000000ff000000LL) << 8) + \ + ((x & 0x000000ff00000000LL) >> 8) + \ + ((x & 0x0000ff0000000000LL) >> 24) + \ + ((x & 0x00ff000000000000LL) >> 40) + \ + ((x & 0xff00000000000000LL) >> 56)) + +#define iswap_int4(x) \ + (((x & 0x000000ff) << 24) + \ + ((x & 0x0000ff00) << 8) + \ + ((x & 0x00ff0000) >> 8) + \ + ((x & 0xff000000) >> 24)) + +#define iswap_int2(x) \ + (((x & 0x00ff) << 8) + \ + ((x & 0xff00) >> 8)) + +/* + * Linux systems may use byteswap.h to get assembly versions of byte-swap + * on intel systems. This can be as trivial as the bswap opcode, which works + * out at over 2-times faster than iswap_int4 above. + */ +#if 0 +#if defined(__linux__) +# include +# undef iswap_int8 +# undef iswap_int4 +# undef iswap_int2 +# define iswap_int8 bswap_64 +# define iswap_int4 bswap_32 +# define iswap_int2 bswap_16 +#endif +#endif + + +/* + * Macros to specify that data read in is of a particular endianness. + * The macros here swap to the appropriate order for the particular machine + * running the macro and return the new answer. These may also be used when + * writing to a file to specify that we wish to write in (eg) big endian + * format. 
+ * + * This leads to efficient code as most of the time these macros are + * trivial. + */ +#ifdef SP_BIG_ENDIAN +#define le_int4(x) iswap_int4((x)) +#endif + +#ifdef SP_LITTLE_ENDIAN +#define le_int4(x) (x) +#endif + +/*----------------------------------------------------------------------------- + * definitions, incase they're not present + */ + +#ifndef PRId64 +#define __PRI64__ "l" +#define PRId64 __PRI64__ "d" +#define PRId32 "d" +#define PRId16 "d" +#define PRId8 "d" +#define PRIu64 __PRI64__ "u" +#define PRIu32 "u" +#define PRIu16 "u" +#define PRIu8 "u" +#endif + +/*----------------------------------------------------------------------------- + * Operating system specifics. + * These ought to be done by autoconf, but are legacy code. + */ +/* + * SunOS 4.x + * Even though we use the ANSI gcc, we make use the the standard SunOS 4.x + * libraries and include files, which are non-ansi + */ +#if defined(__sun__) && !defined(__svr4__) +#define SEEK_SET 0 +#define SEEK_CUR 1 +#define SEEK_END 2 +#endif + +/* + * Microsoft Visual C++ + * Windows + */ +#if defined(_MSC_VER) +#define popen _popen +#define pclose _pclose +#define ftruncate(fd,len) _chsize(fd,len) +#endif + + +/* + * Microsoft Windows running MinGW + */ +#if defined(__MINGW32__) +/* #define mkdir(filename,mode) mkdir((filename)) */ +#define sysconf(x) 512 +#define ftruncate(fd,len) _chsize(fd,len) +#endif + +/* Generic WIN32 API issues */ +#ifdef _WIN32 +# ifndef HAVE_FSEEKO +# if __MSVCRT_VERSION__ >= 0x800 + /* if you have MSVCR80 installed then you can use these definitions: */ +# define off_t __int64 +# define fseeko _fseeki64 +# define ftello _ftelli64 +# else + /* otherwise we're stuck with 32-bit file support */ +# define off_t long +# define fseeko fseek +# define ftello ftell +# endif +# endif /* !HAVE_FSEEKO */ +#endif /* _WIN32 */ + +#ifdef __cplusplus +} +#endif + +#endif /*_OS_H_*/ diff --git a/star-sys/STAR/source/htslib/cram/pooled_alloc.c 
b/star-sys/STAR/source/htslib/cram/pooled_alloc.c new file mode 100644 index 0000000..0d061e7 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/pooled_alloc.c @@ -0,0 +1,170 @@ +/* +Copyright (c) 2009 Genome Research Ltd. +Author: Rob Davies + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "io_lib_config.h"
+#endif
+
+/*
+ * NOTE(review): the header names on the next three lines were missing from
+ * this copy of the patch. <stdlib.h> and <stdio.h> are what the code below
+ * uses (malloc/realloc/free, printf); <stdint.h> is presumed - confirm
+ * against upstream.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include "cram/pooled_alloc.h"
+
+//#define TEST_MAIN
+
+/* Nominal size in bytes of one backing pool (parenthesised so that it is
+ * safe inside any expression). */
+#define PSIZE (1024*1024)
+
+/*
+ * Creates an allocator handing out fixed size items of at least dsize bytes.
+ * The item size is rounded up to a multiple of sizeof(void *) because freed
+ * items are chained together through their first pointer-sized word.
+ *
+ * Returns the allocator on success (release with pool_destroy())
+ *         NULL on failure
+ */
+pool_alloc_t *pool_create(size_t dsize) {
+    pool_alloc_t *p;
+
+    if (NULL == (p = (pool_alloc_t *)malloc(sizeof(*p))))
+        return NULL;
+
+    /* Minimum size is a pointer, for free list */
+    dsize = (dsize + sizeof(void *) - 1) & ~(sizeof(void *)-1);
+    if (dsize < sizeof(void *))
+        dsize = sizeof(void *);
+    p->dsize = dsize;
+
+    p->npools = 0;
+    p->pools = NULL;
+    p->free = NULL;
+
+    return p;
+}
+
+/*
+ * Appends one more backing pool to p->pools.
+ *
+ * Returns the new (empty) pool on success
+ *         NULL on failure
+ */
+static pool_t *new_pool(pool_alloc_t *p) {
+    size_t n = PSIZE / p->dsize;
+    pool_t *pool;
+
+    /* Items larger than PSIZE would otherwise get a zero byte pool that
+     * pool_alloc() then hands out as a full item; give them one slot. */
+    if (n == 0)
+        n = 1;
+
+    pool = realloc(p->pools, (p->npools + 1) * sizeof(*p->pools));
+    if (NULL == pool) return NULL;
+    p->pools = pool;
+    pool = &p->pools[p->npools];
+
+    pool->pool = malloc(n * p->dsize);
+    if (NULL == pool->pool) return NULL;
+
+    pool->used = 0;
+
+    /* Only count the pool once its memory exists, so pool_destroy() never
+     * sees a half-built entry */
+    p->npools++;
+
+    return pool;
+}
+
+/*
+ * Frees every backing pool and the allocator itself. All items previously
+ * returned by pool_alloc() become invalid.
+ */
+void pool_destroy(pool_alloc_t *p) {
+    size_t i;
+
+    for (i = 0; i < p->npools; i++) {
+        free(p->pools[i].pool);
+    }
+    free(p->pools);
+    free(p);
+}
+
+/*
+ * Returns one uninitialised item of p->dsize bytes, or NULL when out of
+ * memory. Items given back with pool_free() are reused first, then space
+ * left in the newest pool, and only then is a new pool created.
+ */
+void *pool_alloc(pool_alloc_t *p) {
+    pool_t *pool;
+    void *ret;
+
+    /* Look on free list */
+    if (NULL != p->free) {
+        ret = p->free;
+        p->free = *((void **)p->free);
+        return ret;
+    }
+
+    /* Look for space in the last pool */
+    if (p->npools) {
+        pool = &p->pools[p->npools - 1];
+        if (pool->used + p->dsize < PSIZE) {
+            ret = ((char *) pool->pool) + pool->used;
+            pool->used += p->dsize;
+            return ret;
+        }
+    }
+
+    /* Need a new pool */
+    pool = new_pool(p);
+    if (NULL == pool) return NULL;
+
+    pool->used = p->dsize;
+    return pool->pool;
+}
+
+/*
+ * Returns an item to the allocator by pushing it on the free list; the
+ * first pointer-sized word of the item is overwritten with the list link.
+ */
+void pool_free(pool_alloc_t *p, void *ptr) {
+    *(void **)ptr = p->free;
+    p->free = ptr;
+}
+
+#ifdef TEST_MAIN
+typedef struct {
+    int x, y, z;
+} xyz;
+
+#define NP 10000
+/* Manual exercise of the allocator; only built when TEST_MAIN is defined */
+int main(void) {
+    int i;
+    xyz *item;
+    xyz **items;
+    pool_alloc_t *p = pool_create(sizeof(xyz));
+
+    items = (xyz **)malloc(NP * sizeof(*items));
+
+    for (i = 0; i < NP; i++) {
+        item = pool_alloc(p);
+        item->x = i;
+        item->y = i+1;
+        item->z = i+2;
+        items[i] = item;
+    }
+
+    for (i = 0; i < NP; i++) {
+        item = items[i];
+        if (i % 3)
+            pool_free(p, item);
+    }
+
+    for (i = 0; i < NP; i++) {
+        item = pool_alloc(p);
+        item->x = 1000000+i;
+        item->y = 1000000+i+1;
+        item->z = 1000000+i+2;
+    }
+
+    for (i = 0; i < NP; i++) {
+        item = items[i];
+        printf("%d\t%d\t%d\t%d\n", i, item->x, item->y, item->z);
+        pool_free(p, item);
+    }
+
+    return 0;
+}
+#endif
diff --git a/star-sys/STAR/source/htslib/cram/pooled_alloc.h b/star-sys/STAR/source/htslib/cram/pooled_alloc.h
new file mode 100644
index 0000000..fa3c81f
--- /dev/null
+++ b/star-sys/STAR/source/htslib/cram/pooled_alloc.h
@@ -0,0 +1,56 @@
+/*
+Copyright (c) 2009 Genome Research Ltd.
+Author: Rob Davies
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. 
IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _POOLED_ALLOC_H_
+#define _POOLED_ALLOC_H_
+
+/* size_t is used below; include it here so the header is self-contained */
+#include <stddef.h>
+
+/*
+ * Implements a pooled block allocator where all items are the same size,
+ * but we need many of them.
+ */
+
+/* One backing block of memory that items are carved from */
+typedef struct {
+    void   *pool;   /* start of the block */
+    size_t  used;   /* bytes of the block already handed out */
+} pool_t;
+
+/* Allocator state; create with pool_create(), release with pool_destroy() */
+typedef struct {
+    size_t dsize;   /* size in bytes of every item */
+    size_t npools;  /* number of entries in pools[] */
+    pool_t *pools;  /* array of backing blocks */
+    void *free;     /* head of the list of items returned by pool_free() */
+} pool_alloc_t;
+
+/* Creates an allocator for items of dsize bytes; NULL on failure */
+pool_alloc_t *pool_create(size_t dsize);
+/* Frees the allocator and every item it ever handed out */
+void pool_destroy(pool_alloc_t *p);
+/* Returns one item, or NULL on failure */
+void *pool_alloc(pool_alloc_t *p);
+/* Gives an item obtained from pool_alloc() back for reuse */
+void pool_free(pool_alloc_t *p, void *ptr);
+
+
+#endif /*_POOLED_ALLOC_H_*/
diff --git a/star-sys/STAR/source/htslib/cram/sam_header.c b/star-sys/STAR/source/htslib/cram/sam_header.c
new file mode 100644
index 0000000..2a8110c
--- /dev/null
+++ b/star-sys/STAR/source/htslib/cram/sam_header.c
@@ -0,0 +1,1222 @@
+/*
+Copyright (c) 2013 Genome Research Ltd.
+Author: James Bonfield
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+   3. 
Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "io_lib_config.h"
+#endif
+
+/*
+ * NOTE(review): the header names on the next two lines were missing from
+ * this copy of the patch; they are restored from what this file uses
+ * (strlen/strncpy/memmove and assert) - confirm against upstream.
+ */
+#include <string.h>
+#include <assert.h>
+
+#include "cram/sam_header.h"
+#include "cram/string_alloc.h"
+
+#ifdef SAMTOOLS
+#define sam_hdr_parse sam_hdr_parse_
+#endif
+
+/*
+ * Reports a header parsing problem on stderr.
+ * Only the text of 'line' up to its first new-line (or 'len' bytes,
+ * whichever comes first) is quoted; lno is the line number to report.
+ */
+static void sam_hdr_error(char *msg, char *line, int len, int lno) {
+    int j;
+
+    for (j = 0; j < len && line[j] != '\n'; j++)
+        ;
+    fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, j, line);
+}
+
+/*
+ * Debugging aid: prints every header type with its lines and tags, followed
+ * by the @PG chains (walked backwards from each chain end via prev_id), to
+ * stdout.
+ */
+void sam_hdr_dump(SAM_hdr *hdr) {
+    khint_t k;
+    int i;
+
+    printf("===DUMP===\n");
+    for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
+        SAM_hdr_type *t1, *t2;
+        char c[2];
+
+        if (!kh_exist(hdr->h, k))
+            continue;
+
+        t1 = t2 = kh_val(hdr->h, k);
+        /* The hash key is the two type characters packed as (c0<<8)|c1 */
+        c[0] = kh_key(hdr->h, k)>>8;
+        c[1] = kh_key(hdr->h, k)&0xff;
+        printf("Type %.2s, count %d\n", c, t1->prev->order+1);
+
+        do {
+            SAM_hdr_tag *tag;
+            printf(">>>%d ", t1->order);
+            for (tag = t1->tag; tag; tag=tag->next) {
+                printf("\"%.2s\":\"%.*s\"\t",
+                       tag->str, tag->len-3, tag->str+3);
+            }
+            putchar('\n');
+            t1 = t1->next;
+        } while (t1 != t2);
+    }
+
+    /* Dump out PG chains */
+    printf("\n@PG chains:\n");
+    for (i = 0; i < hdr->npg_end; i++) {
+        int j;
+        printf(" %d:", i);
+        for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) {
+            printf("%s%d(%.*s)",
+                   j == hdr->pg_end[i] ? " " : "->",
+                   j, hdr->pg[j].name_len, hdr->pg[j].name);
+        }
+        printf("\n");
+    }
+
+    puts("===END DUMP===");
+}
+
+/* Updates the hash tables in the SAM_hdr structure.
+ *
+ * 'type' is the two character record type packed as (c0<<8)|c1 and h_type
+ * the freshly parsed line. @SQ, @RG and @PG lines are appended to the
+ * sh->ref[], sh->rg[] and sh->pg[] arrays and indexed by name in the
+ * matching hash; other types are ignored.
+ *
+ * Arrays are grown through a temporary pointer so that the existing array
+ * is still owned by sh (and freed by sam_hdr_free()) if realloc fails.
+ *
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+static int sam_hdr_update_hashes(SAM_hdr *sh,
+                                 int type,
+                                 SAM_hdr_type *h_type) {
+    /* Add to reference hash? */
+    if ((type>>8) == 'S' && (type&0xff) == 'Q') {
+        SAM_hdr_tag *tag;
+        int nref = sh->nref;
+        void *grown;
+
+        grown = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
+        if (!grown)
+            return -1;
+        sh->ref = grown;
+
+        tag = h_type->tag;
+        sh->ref[nref].name = NULL;
+        sh->ref[nref].len = 0;
+        sh->ref[nref].ty = h_type;
+        sh->ref[nref].tag = tag;
+
+        while (tag) {
+            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
+                /* tag->len covers "SN:" + value, so it is large enough
+                 * for the value plus its terminating nul */
+                if (!(sh->ref[nref].name = malloc(tag->len)))
+                    return -1;
+                strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
+                sh->ref[nref].name[tag->len-3] = 0;
+            } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
+                sh->ref[nref].len = atoi(tag->str+3);
+            }
+            tag = tag->next;
+        }
+
+        if (sh->ref[nref].name) {
+            khint_t k;
+            int r;
+            k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r);
+            if (-1 == r) return -1;
+            kh_val(sh->ref_hash, k) = nref;
+        }
+
+        sh->nref++;
+    }
+
+    /* Add to read-group hash? */
+    if ((type>>8) == 'R' && (type&0xff) == 'G') {
+        SAM_hdr_tag *tag;
+        int nrg = sh->nrg;
+        void *grown;
+
+        grown = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
+        if (!grown)
+            return -1;
+        sh->rg = grown;
+
+        tag = h_type->tag;
+        sh->rg[nrg].name = NULL;
+        sh->rg[nrg].name_len = 0;
+        sh->rg[nrg].ty = h_type;
+        sh->rg[nrg].tag = tag;
+        sh->rg[nrg].id = nrg;
+
+        while (tag) {
+            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
+                if (!(sh->rg[nrg].name = malloc(tag->len)))
+                    return -1;
+                strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
+                sh->rg[nrg].name[tag->len-3] = 0;
+                sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
+            }
+            tag = tag->next;
+        }
+
+        if (sh->rg[nrg].name) {
+            khint_t k;
+            int r;
+            k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r);
+            if (-1 == r) return -1;
+            kh_val(sh->rg_hash, k) = nrg;
+        }
+
+        sh->nrg++;
+    }
+
+    /* Add to program hash? */
+    if ((type>>8) == 'P' && (type&0xff) == 'G') {
+        SAM_hdr_tag *tag;
+        int npg = sh->npg;
+        void *grown;
+
+        grown = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
+        if (!grown)
+            return -1;
+        sh->pg = grown;
+
+        tag = h_type->tag;
+        sh->pg[npg].name = NULL;
+        sh->pg[npg].name_len = 0;
+        sh->pg[npg].ty = h_type;
+        sh->pg[npg].tag = tag;
+        sh->pg[npg].id = npg;
+        sh->pg[npg].prev_id = -1;
+
+        while (tag) {
+            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
+                if (!(sh->pg[npg].name = malloc(tag->len)))
+                    return -1;
+                strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
+                sh->pg[npg].name[tag->len-3] = 0;
+                sh->pg[npg].name_len = strlen(sh->pg[npg].name);
+            } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
+                // Resolve later if needed
+                khint_t k;
+                /* Temporarily nul terminate the value for the lookup.
+                 * NOTE(review): relies on tag->str having a byte at
+                 * index tag->len - confirm against string_ndup(). */
+                char tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
+                k = kh_get(m_s2i, sh->pg_hash, tag->str+3);
+                tag->str[tag->len] = tmp;
+
+                if (k != kh_end(sh->pg_hash)) {
+                    int p_id = kh_val(sh->pg_hash, k);
+                    sh->pg[npg].prev_id = sh->pg[p_id].id;
+
+                    /* Unmark previous entry as a PG termination */
+                    if (sh->npg_end > 0 &&
+                        sh->pg_end[sh->npg_end-1] == p_id) {
+                        sh->npg_end--;
+                    } else {
+                        int i;
+                        for (i = 0; i < sh->npg_end; i++) {
+                            if (sh->pg_end[i] == p_id) {
+                                memmove(&sh->pg_end[i], &sh->pg_end[i+1],
+                                        (sh->npg_end-i-1)*sizeof(*sh->pg_end));
+                                sh->npg_end--;
+                                /* An id is only ever added to pg_end[]
+                                 * once, so there is nothing more to find */
+                                break;
+                            }
+                        }
+                    }
+                } else {
+                    sh->pg[npg].prev_id = -1;
+                }
+            }
+            tag = tag->next;
+        }
+
+        if (sh->pg[npg].name) {
+            khint_t k;
+            int r;
+            k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r);
+            if (-1 == r) return -1;
+            kh_val(sh->pg_hash, k) = npg;
+        }
+
+        /* Add to npg_end[] array. Remove later if we find a PP line */
+        if (sh->npg_end >= sh->npg_end_alloc) {
+            int new_alloc = sh->npg_end_alloc
+                ? sh->npg_end_alloc*2
+                : 4;
+            void *grown_end = realloc(sh->pg_end, new_alloc * sizeof(int));
+            if (!grown_end)
+                return -1;
+            /* Only record the new capacity once we really have it */
+            sh->pg_end = grown_end;
+            sh->npg_end_alloc = new_alloc;
+        }
+        sh->pg_end[sh->npg_end++] = npg;
+
+        sh->npg++;
+    }
+
+    return 0;
+}
+
+/*
+ * Appends a formatted line to an existing SAM header.
+ * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
+ * optional new-line. If it contains more than 1 line then multiple lines
+ * will be added in order.
+ *
+ * Len is the length of the text data, or 0 if unknown (in which case
+ * it should be null terminated). 
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
+    int i, lno, text_offset;
+    char *hdr;
+
+    if (!len)
+        len = strlen(lines);
+
+    /* Keep our own copy of the text; everything below parses that copy */
+    text_offset = ks_len(&sh->text);
+    if (EOF == kputsn(lines, len, &sh->text))
+        return -1;
+    hdr = ks_str(&sh->text) + text_offset;
+
+    /* Every pass consumes one complete line, so lno advances with it */
+    for (i = 0, lno = 1; i < len; i++, lno++) {
+        khint32_t type;
+        khint_t k;
+
+        int l_start = i, new;
+        SAM_hdr_type *h_type;
+        SAM_hdr_tag *h_tag, *last;
+
+        if (hdr[i] != '@') {
+            sam_hdr_error("Header line does not start with '@'",
+                          &hdr[l_start], len - l_start, lno);
+            return -1;
+        }
+
+        /* Both key characters must lie inside the supplied text before
+         * they are looked at */
+        if (i + 2 >= len ||
+            hdr[i+1] < 'A' || hdr[i+1] > 'z' ||
+            hdr[i+2] < 'A' || hdr[i+2] > 'z') {
+            sam_hdr_error("Header line does not have a two character key",
+                          &hdr[l_start], len - l_start, lno);
+            return -1;
+        }
+        type = (hdr[i+1]<<8) | hdr[i+2];
+
+        i += 3;
+        if (hdr[i] == '\n')
+            continue;
+
+        // Add the header line type
+        if (!(h_type = pool_alloc(sh->type_pool)))
+            return -1;
+        /* kh_put reports failure through 'new', not through the iterator */
+        k = kh_put(sam_hdr, sh->h, type, &new);
+        if (new < 0)
+            return -1;
+
+        // Form the ring, either with self or other lines of this type
+        if (!new) {
+            SAM_hdr_type *t = kh_val(sh->h, k), *p;
+            p = t->prev;
+
+            assert(p->next == t);
+            p->next = h_type;
+            h_type->prev = p;
+
+            t->prev = h_type;
+            h_type->next = t;
+            h_type->order = p->order+1;
+        } else {
+            kh_val(sh->h, k) = h_type;
+            h_type->prev = h_type->next = h_type;
+            h_type->order = 0;
+        }
+
+        // Parse the tags on this line
+        last = NULL;
+        if ((type>>8) == 'C' && (type&0xff) == 'O') {
+            /* @CO lines hold free text: keep the rest of the line as a
+             * single tag */
+            int j;
+            if (hdr[i] != '\t') {
+                sam_hdr_error("Missing tab",
+                              &hdr[l_start], len - l_start, lno);
+                return -1;
+            }
+
+            for (j = ++i; j < len && hdr[j] != '\n'; j++)
+                ;
+
+            if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
+                return -1;
+            h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
+            h_tag->len = j-i;
+            h_tag->next = NULL;
+            if (!h_tag->str)
+                return -1;
+
+            i = j;
+
+        } else {
+            do {
+                int j;
+                if (hdr[i] != '\t') {
+                    sam_hdr_error("Missing tab",
+                                  &hdr[l_start], len - l_start, lno);
+                    return -1;
+                }
+
+                for (j = ++i; j < len && hdr[j] != '\n' && hdr[j] != '\t'; j++)
+                    ;
+
+                if (!(h_tag = pool_alloc(sh->tag_pool)))
+                    return -1;
+                h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
+                h_tag->len = j-i;
+                h_tag->next = NULL;
+                if (!h_tag->str)
+                    return -1;
+
+                if (h_tag->len < 3 || h_tag->str[2] != ':') {
+                    sam_hdr_error("Malformed key:value pair",
+                                  &hdr[l_start], len - l_start, lno);
+                    return -1;
+                }
+
+                if (last)
+                    last->next = h_tag;
+                else
+                    h_type->tag = h_tag;
+
+                last = h_tag;
+                i = j;
+            } while (i < len && hdr[i] != '\n');
+        }
+
+        /* Update RG/SQ hashes */
+        if (-1 == sam_hdr_update_hashes(sh, type, h_type))
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Adds a single line to a SAM header.
+ * Specify type and one or more key,value pairs, ending with the NULL key.
+ * Eg. sam_hdr_add(h, "SQ", "SN", "foo", "LN", "100", NULL).
+ *
+ * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
+ *        -1 on failure
+ */
+int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
+    va_list args;
+    int ret;
+
+    va_start(args, type);
+    ret = sam_hdr_vadd(sh, type, args, NULL);
+#if defined(HAVE_VA_COPY)
+    /* sam_hdr_vadd() worked on its own copy, so the list is still ours to
+     * end. Without va_copy it has already ended the list it was given. */
+    va_end(args);
+#endif
+
+    return ret;
+}
+
+/*
+ * Appends the NULL terminated key,value pairs held in ap to the header text
+ * and to the tag list of h_type. *last is the current tail of that tag list
+ * (NULL when the list is empty) and is updated as tags are added.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+static int sam_hdr_vadd_tags(SAM_hdr *sh, SAM_hdr_type *h_type,
+                             SAM_hdr_tag **last, va_list ap) {
+    for (;;) {
+        char *k, *v;
+        int idx;
+        SAM_hdr_tag *h_tag;
+
+        if (!(k = (char *)va_arg(ap, char *)))
+            break;
+        v = va_arg(ap, char *);
+
+        if (EOF == kputc_('\t', &sh->text))
+            return -1;
+
+        if (!(h_tag = pool_alloc(sh->tag_pool)))
+            return -1;
+        idx = ks_len(&sh->text);
+
+        if (EOF == kputs(k, &sh->text))
+            return -1;
+        if (EOF == kputc_(':', &sh->text))
+            return -1;
+        if (EOF == kputs(v, &sh->text))
+            return -1;
+
+        /* The tag keeps its own copy of the "key:value" text just written */
+        h_tag->len = ks_len(&sh->text) - idx;
+        h_tag->str = string_ndup(sh->str_pool,
+                                 ks_str(&sh->text) + idx,
+                                 h_tag->len);
+        h_tag->next = NULL;
+        if (!h_tag->str)
+            return -1;
+
+        if (*last)
+            (*last)->next = h_tag;
+        else
+            h_type->tag = h_tag;
+
+        *last = h_tag;
+    }
+
+    return 0;
+}
+
+/*
+ * As sam_hdr_add(), but the key,value pairs come from two places: first the
+ * NULL terminated pairs passed directly after ap, then the NULL terminated
+ * pairs held in the va_list ap.
+ *
+ * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
+ *        -1 on failure
+ */
+int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
+    va_list args;
+    SAM_hdr_type *h_type;
+    SAM_hdr_tag *last;
+    int new, r;
+    khint32_t type_i = (type[0]<<8) | type[1], k;
+
+#if defined(HAVE_VA_COPY)
+    va_list ap_local;
+#endif
+
+    if (EOF == kputc_('@', &sh->text))
+        return -1;
+    if (EOF == kputsn(type, 2, &sh->text))
+        return -1;
+
+    if (!(h_type = pool_alloc(sh->type_pool)))
+        return -1;
+    /* Pool memory is not cleared; make "no tags" well defined */
+    h_type->tag = NULL;
+
+    /* kh_put reports failure through 'new', not through the iterator */
+    k = kh_put(sam_hdr, sh->h, type_i, &new);
+    if (new < 0)
+        return -1;
+
+    // Form the ring, either with self or other lines of this type
+    if (!new) {
+        /* The hash keeps pointing at the first line of this type; the new
+         * line is spliced in just before it, ie at the end of the ring */
+        SAM_hdr_type *t = kh_val(sh->h, k), *p;
+        p = t->prev;
+
+        assert(p->next == t);
+        p->next = h_type;
+        h_type->prev = p;
+
+        t->prev = h_type;
+        h_type->next = t;
+        h_type->order = p->order + 1;
+    } else {
+        kh_val(sh->h, k) = h_type;
+        h_type->prev = h_type->next = h_type;
+        h_type->order = 0;
+    }
+
+    last = NULL;
+
+    // Any ... varargs
+    va_start(args, ap);
+    r = sam_hdr_vadd_tags(sh, h_type, &last, args);
+    va_end(args);
+    if (r < 0)
+        return -1;
+
+    // Plus the specified va_list params
+#if defined(HAVE_VA_COPY)
+    va_copy(ap_local, ap);
+    r = sam_hdr_vadd_tags(sh, h_type, &last, ap_local);
+    va_end(ap_local);
+#else
+    r = sam_hdr_vadd_tags(sh, h_type, &last, ap);
+    va_end(ap);
+#endif
+    if (r < 0)
+        return -1;
+
+    if (EOF == kputc('\n', &sh->text))
+        return -1;
+
+    if (-1 == sam_hdr_update_hashes(sh, (int)type_i, h_type))
+        return -1;
+
+    return h_type->order;
+}
+
+/*
+ * Returns the first header item matching 'type'. If ID_key is non-NULL it
+ * looks for a tag with that two character key and compares its value
+ * against ID_value.
+ *
+ * Returns NULL if no type/ID is found
+ */
+SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
+                           char *ID_key, char *ID_value) {
+    SAM_hdr_type *t1, *t2;
+    int itype = (type[0]<<8)|(type[1]);
+    khint_t k;
+
+    /* Special case for types we have prebuilt hashes on */
+    if (ID_key) {
+        if (type[0] == 'S' && type[1] == 'Q' &&
+            ID_key[0] == 'S' && ID_key[1] == 'N') {
+            k = kh_get(m_s2i, hdr->ref_hash, ID_value);
+            return k != kh_end(hdr->ref_hash)
+                ? hdr->ref[kh_val(hdr->ref_hash, k)].ty
+                : NULL;
+        }
+
+        if (type[0] == 'R' && type[1] == 'G' &&
+            ID_key[0] == 'I' && ID_key[1] == 'D') {
+            k = kh_get(m_s2i, hdr->rg_hash, ID_value);
+            return k != kh_end(hdr->rg_hash)
+                ? hdr->rg[kh_val(hdr->rg_hash, k)].ty
+                : NULL;
+        }
+
+        if (type[0] == 'P' && type[1] == 'G' &&
+            ID_key[0] == 'I' && ID_key[1] == 'D') {
+            k = kh_get(m_s2i, hdr->pg_hash, ID_value);
+            return k != kh_end(hdr->pg_hash)
+                ? hdr->pg[kh_val(hdr->pg_hash, k)].ty
+                : NULL;
+        }
+    }
+
+    k = kh_get(sam_hdr, hdr->h, itype);
+    if (k == kh_end(hdr->h))
+        return NULL;
+
+    if (!ID_key)
+        return kh_val(hdr->h, k);
+
+    /* Walk the ring of lines of this type looking for key == value */
+    t1 = t2 = kh_val(hdr->h, k);
+    do {
+        SAM_hdr_tag *tag;
+        for (tag = t1->tag; tag; tag = tag->next) {
+            if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1] &&
+                strcmp(tag->str+3, ID_value) == 0)
+                return t1;
+        }
+        t1 = t1->next;
+    } while (t1 != t2);
+
+    return NULL;
+}
+
+/*
+ * As per sam_hdr_find(), but returns a complete line of formatted text
+ * for a specific head type/ID combination. If ID is NULL then it returns
+ * the first line of the specified type.
+ *
+ * The returned string is malloced and should be freed by the calling
+ * function with free().
+ *
+ * Returns NULL if no type/ID is found.
+ */
+char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
+                        char *ID_key, char *ID_value) {
+    SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
+    kstring_t ks = KS_INITIALIZER;
+    SAM_hdr_tag *tag;
+    int r = 0;
+
+    if (!ty)
+        return NULL;
+
+    // Paste together the line from the hashed copy
+    r |= (kputc_('@', &ks) == EOF);
+    r |= (kputs(type, &ks) == EOF);
+    for (tag = ty->tag; tag; tag = tag->next) {
+        r |= (kputc_('\t', &ks) == EOF);
+        r |= (kputsn(tag->str, tag->len, &ks) == EOF);
+    }
+
+    if (r) {
+        KS_FREE(&ks);
+        return NULL;
+    }
+
+    return ks_str(&ks);
+}
+
+
+/*
+ * Looks for a specific key in a single sam header line.
+ * If prev is non-NULL it also fills this out with the previous tag, to
+ * permit use in key removal. *prev is set to NULL when the tag is the first
+ * key in the list. When a tag isn't found, prev (if non NULL) will be the last
+ * tag in the existing list. 
+ *
+ * Returns the tag pointer on success
+ *         NULL on failure
+ */
+SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
+                              SAM_hdr_type *type,
+                              char *key,
+                              SAM_hdr_tag **prev) {
+    SAM_hdr_tag *tag, *p = NULL;
+
+    /* Only the two key characters are compared; p trails one tag behind */
+    for (tag = type->tag; tag; p = tag, tag = tag->next) {
+        if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
+            if (prev)
+                *prev = p;
+            return tag;
+        }
+    }
+
+    if (prev)
+        *prev = p;
+
+    return NULL;
+}
+
+
+/*
+ * Adds or updates tag key,value pairs in a header line.
+ * Eg for adding M5 tags to @SQ lines or updating sort order for the
+ * @HD line (although use the sam_hdr_sort_order() function for
+ * HD manipulation, which is a wrapper around this function).
+ *
+ * Specify multiple key,value pairs ending in NULL.
+ *
+ * A tag is only created or modified once its new "key:value" text has been
+ * built, so a failure part way through leaves the tag list usable.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
+    va_list ap;
+    int ret = 0;
+
+    va_start(ap, type);
+
+    for (;;) {
+        char *k, *v, *str;
+        int idx, len;
+        SAM_hdr_tag *tag, *prev;
+
+        if (!(k = (char *)va_arg(ap, char *)))
+            break;
+        v = va_arg(ap, char *);
+
+        /* Format the new text first ... */
+        idx = ks_len(&hdr->text);
+        if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0) {
+            ret = -1;
+            break;
+        }
+        len = ks_len(&hdr->text) - idx;
+        str = string_ndup(hdr->str_pool,
+                          ks_str(&hdr->text) + idx,
+                          len);
+        if (!str) {
+            ret = -1;
+            break;
+        }
+
+        /* ... then attach it to the existing tag, or to a new one at the
+         * end of the list */
+        tag = sam_hdr_find_key(hdr, type, k, &prev);
+        if (!tag) {
+            if (!(tag = pool_alloc(hdr->tag_pool))) {
+                ret = -1;
+                break;
+            }
+            tag->next = NULL;
+            if (prev)
+                prev->next = tag;
+            else
+                type->tag = tag;
+        }
+
+        tag->str = str;
+        tag->len = len;
+    }
+
+    /* Reached on success and on every failure path */
+    va_end(ap);
+
+    return ret;
+}
+
+/* Packs a two character record type into the integer used as hash key */
+#define K(a) (((a)[0]<<8)|((a)[1]))
+
+/*
+ * Reconstructs the kstring from the header hash table. 
+ * The @HD line (if any) is written first, followed by every other type in
+ * hash order; lines of one type keep their relative order. On failure the
+ * existing header text is left untouched.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_rebuild(SAM_hdr *hdr) {
+    /* Order: HD then others */
+    kstring_t ks = KS_INITIALIZER;
+    khint_t k;
+
+
+    k = kh_get(sam_hdr, hdr->h, K("HD"));
+    if (k != kh_end(hdr->h)) {
+        SAM_hdr_type *ty = kh_val(hdr->h, k);
+        SAM_hdr_tag *tag;
+        if (EOF == kputs("@HD", &ks))
+            goto err;
+        for (tag = ty->tag; tag; tag = tag->next) {
+            if (EOF == kputc_('\t', &ks))
+                goto err;
+            if (EOF == kputsn_(tag->str, tag->len, &ks))
+                goto err;
+        }
+        if (EOF == kputc('\n', &ks))
+            goto err;
+    }
+
+    for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
+        SAM_hdr_type *t1, *t2;
+
+        if (!kh_exist(hdr->h, k))
+            continue;
+
+        /* Already written above */
+        if (kh_key(hdr->h, k) == K("HD"))
+            continue;
+
+        t1 = t2 = kh_val(hdr->h, k);
+        do {
+            SAM_hdr_tag *tag;
+            char c[2];
+
+            if (EOF == kputc_('@', &ks))
+                goto err;
+            c[0] = kh_key(hdr->h, k)>>8;
+            c[1] = kh_key(hdr->h, k)&0xff;
+            if (EOF == kputsn_(c, 2, &ks))
+                goto err;
+            for (tag = t1->tag; tag; tag=tag->next) {
+                if (EOF == kputc_('\t', &ks))
+                    goto err;
+                if (EOF == kputsn_(tag->str, tag->len, &ks))
+                    goto err;
+            }
+            if (EOF == kputc('\n', &ks))
+                goto err;
+            t1 = t1->next;
+        } while (t1 != t2);
+    }
+
+    /* Swap the freshly built text in for the old one */
+    if (ks_str(&hdr->text))
+        KS_FREE(&hdr->text);
+
+    hdr->text = ks;
+
+    return 0;
+
+ err:
+    /* Drop the partially built text instead of leaking it */
+    KS_FREE(&ks);
+    return -1;
+}
+
+
+/*
+ * Creates an empty SAM header, ready to be populated. 
+ *
+ * Returns a SAM_hdr struct on success (free with sam_hdr_free())
+ *         NULL on failure
+ */
+SAM_hdr *sam_hdr_new() {
+    /* calloc: every pointer starts NULL, which the err path relies on */
+    SAM_hdr *sh = calloc(1, sizeof(*sh));
+
+    if (!sh)
+        return NULL;
+
+    sh->h = kh_init(sam_hdr);
+    if (!sh->h)
+        goto err;
+
+    sh->ID_cnt = 1;
+    sh->ref_count = 1;
+
+    sh->nref = 0;
+    sh->ref = NULL;
+    if (!(sh->ref_hash = kh_init(m_s2i)))
+        goto err;
+
+    sh->nrg = 0;
+    sh->rg = NULL;
+    if (!(sh->rg_hash = kh_init(m_s2i)))
+        goto err;
+
+    sh->npg = 0;
+    sh->pg = NULL;
+    sh->npg_end = sh->npg_end_alloc = 0;
+    sh->pg_end = NULL;
+    if (!(sh->pg_hash = kh_init(m_s2i)))
+        goto err;
+
+    KS_INIT(&sh->text);
+
+    if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
+        goto err;
+
+    if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
+        goto err;
+
+    if (!(sh->str_pool = string_pool_create(8192)))
+        goto err;
+
+    return sh;
+
+ err:
+    /* Release whatever had been created before the failure */
+    if (sh->h)
+        kh_destroy(sam_hdr, sh->h);
+
+    if (sh->ref_hash)
+        kh_destroy(m_s2i, sh->ref_hash);
+
+    if (sh->rg_hash)
+        kh_destroy(m_s2i, sh->rg_hash);
+
+    if (sh->pg_hash)
+        kh_destroy(m_s2i, sh->pg_hash);
+
+    if (sh->tag_pool)
+        pool_destroy(sh->tag_pool);
+
+    if (sh->type_pool)
+        pool_destroy(sh->type_pool);
+
+    if (sh->str_pool)
+        string_pool_destroy(sh->str_pool);
+
+    free(sh);
+
+    return NULL;
+}
+
+
+/*
+ * Tokenises a SAM header into a hash table.
+ * Also extracts a few bits on specific data types, such as @RG lines.
+ *
+ * hdr may be NULL, in which case an empty header is returned. len is the
+ * length of hdr, or 0 if hdr is nul terminated.
+ *
+ * Returns a SAM_hdr struct on success (free with sam_hdr_free())
+ *         NULL on failure
+ */
+SAM_hdr *sam_hdr_parse(const char *hdr, int len) {
+    /* Make an empty SAM_hdr */
+    SAM_hdr *sh;
+
+    sh = sam_hdr_new();
+    if (NULL == sh) return NULL;
+
+    if (NULL == hdr) return sh; // empty header is permitted
+
+    /* Parse the header, line by line */
+    if (-1 == sam_hdr_add_lines(sh, hdr, len)) {
+        sam_hdr_free(sh);
+        return NULL;
+    }
+
+    //sam_hdr_dump(sh);
+    //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL);
+    //sam_hdr_rebuild(sh);
+    //printf(">>%s<<", ks_str(sh->text));
+
+    //parse_references(sh);
+    //parse_read_groups(sh);
+
+    /* Resolve PP links between @PG lines; the result is not checked here */
+    sam_hdr_link_pg(sh);
+    //sam_hdr_dump(sh);
+
+    return sh;
+}
+
+/*
+ * Produces a duplicate copy of hdr and returns it. 
+ * Returns NULL on failure
+ */
+SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
+    /* The copy is made by re-parsing the source header's rebuilt text */
+    const int rebuilt = sam_hdr_rebuild(hdr);
+
+    if (rebuilt != -1)
+        return sam_hdr_parse(sam_hdr_str(hdr), sam_hdr_length(hdr));
+
+    return NULL;
+}
+
+/*! Increments a reference count on hdr.
+ *
+ * This permits multiple files to share the same header, all calling
+ * sam_hdr_free when done, without causing errors for other open files.
+ */
+void sam_hdr_incr_ref(SAM_hdr *hdr) {
+    hdr->ref_count += 1;
+}
+
+/*! Decrements a reference count on hdr.
+ *
+ * This permits multiple files to share the same header, all calling
+ * sam_hdr_free when done, without causing errors for other open files.
+ *
+ * If the reference count hits zero then the header is automatically
+ * freed. This makes it a synonym for sam_hdr_free().
+ */
+void sam_hdr_decr_ref(SAM_hdr *hdr) {
+    /* sam_hdr_free() does the decrement and frees at zero */
+    sam_hdr_free(hdr);
+}
+
+/*! Deallocates all storage used by a SAM_hdr struct.
+ *
+ * This also decrements the header reference count. If after decrementing
+ * it is still non-zero then the header is assumed to be in use by another
+ * caller and the free is not done.
+ *
+ * This is a synonym for sam_hdr_decr_ref(). 
+ */ +void sam_hdr_free(SAM_hdr *hdr) { + if (!hdr) + return; + + if (--hdr->ref_count > 0) + return; + + if (ks_str(&hdr->text)) + KS_FREE(&hdr->text); + + if (hdr->h) + kh_destroy(sam_hdr, hdr->h); + + if (hdr->ref_hash) + kh_destroy(m_s2i, hdr->ref_hash); + + if (hdr->ref) { + int i; + for (i = 0; i < hdr->nref; i++) + if (hdr->ref[i].name) + free(hdr->ref[i].name); + free(hdr->ref); + } + + if (hdr->rg_hash) + kh_destroy(m_s2i, hdr->rg_hash); + + if (hdr->rg) { + int i; + for (i = 0; i < hdr->nrg; i++) + if (hdr->rg[i].name) + free(hdr->rg[i].name); + free(hdr->rg); + } + + if (hdr->pg_hash) + kh_destroy(m_s2i, hdr->pg_hash); + + if (hdr->pg) { + int i; + for (i = 0; i < hdr->npg; i++) + if (hdr->pg[i].name) + free(hdr->pg[i].name); + free(hdr->pg); + } + + if (hdr->pg_end) + free(hdr->pg_end); + + if (hdr->type_pool) + pool_destroy(hdr->type_pool); + + if (hdr->tag_pool) + pool_destroy(hdr->tag_pool); + + if (hdr->str_pool) + string_pool_destroy(hdr->str_pool); + + free(hdr); +} + +int sam_hdr_length(SAM_hdr *hdr) { /* length of the header text held in hdr->text */ + return ks_len(&hdr->text); +} + +char *sam_hdr_str(SAM_hdr *hdr) { /* header text held in hdr->text; owned by hdr */ + return ks_str(&hdr->text); +} + +/* + * Looks up a reference sequence by name and returns the numerical ID. + * Returns -1 if unknown reference. + */ +int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) { + khint_t k = kh_get(m_s2i, hdr->ref_hash, ref); + return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k); +} + +/* + * Looks up a read-group by name and returns a pointer to its SAM_RG + * entry in the hdr->rg[] array. + * + * Returns NULL on failure + */ +SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) { + khint_t k = kh_get(m_s2i, hdr->rg_hash, rg); + return k == kh_end(hdr->rg_hash) + ? NULL + : &hdr->rg[kh_val(hdr->rg_hash, k)]; +} + + +/* + * Fixes any PP links in @PG headers. + * If the entries are in order then this doesn't need doing, but in case + * our header is out of order this goes through the hdr->pg[] array + * setting the prev_id field.
+ * + * Note we can have multiple complete chains. This code should identify the + * tails of these chains as these are the entries we have to link to in + * subsequent PP records. + * + * Returns 0 on sucess + * -1 on failure (indicating broken PG/PP records) + */ +int sam_hdr_link_pg(SAM_hdr *hdr) { + int i, j, ret = 0; + + hdr->npg_end_alloc = hdr->npg; + hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end)); + if (!hdr->pg_end) + return -1; + + for (i = 0; i < hdr->npg; i++) + hdr->pg_end[i] = i; + + for (i = 0; i < hdr->npg; i++) { + khint_t k; + SAM_hdr_tag *tag; + char tmp; + + for (tag = hdr->pg[i].tag; tag; tag = tag->next) { + if (tag->str[0] == 'P' && tag->str[1] == 'P') + break; + } + if (!tag) { + /* Chain start points */ + continue; + } + + tmp = tag->str[tag->len]; tag->str[tag->len] = 0; + k = kh_get(m_s2i, hdr->pg_hash, tag->str+3); + tag->str[tag->len] = tmp; + + if (k == kh_end(hdr->pg_hash)) { + ret = -1; + continue; + } + + hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id; + hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1; + } + + for (i = j = 0; i < hdr->npg; i++) { + if (hdr->pg_end[i] != -1) + hdr->pg_end[j++] = hdr->pg_end[i]; + } + hdr->npg_end = j; + + return ret; +} + +/* + * Returns a unique ID from a base name. + * + * The value returned is valid until the next call to + * this function. + */ +const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) { + khint_t k = kh_get(m_s2i, sh->pg_hash, name); + if (k == kh_end(sh->pg_hash)) + return name; + + do { + sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++); + k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf); + } while (k == kh_end(sh->pg_hash)); + + return sh->ID_buf; +} + +/* + * Add an @PG line. + * + * If we wish complete control over this use sam_hdr_add() directly. This + * function uses that, but attempts to do a lot of tedious house work for + * you too. + * + * - It will generate a suitable ID if the supplied one clashes. 
+ * - It will generate multiple @PG records if we have multiple PG chains. + * + * Call it as per sam_hdr_add() with a series of key,value pairs ending + * in NULL. + * + * Returns 0 on success + * -1 on failure + */ +int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) { + va_list args; + int r; + + if (sh->npg_end) { + /* Copy ends array to avoid us looping while modifying it */ + int *end = malloc(sh->npg_end * sizeof(int)); + int i, nends = sh->npg_end; + + if (!end) + return -1; + + memcpy(end, sh->pg_end, nends * sizeof(*end)); + + for (i = 0; i < nends; i++) { + /* A va_list is indeterminate once the callee has consumed it, + * so restart it for every record and always pair it with va_end. */ + va_start(args, name); + r = sam_hdr_vadd(sh, "PG", args, + "ID", sam_hdr_PG_ID(sh, name), + "PN", name, + "PP", sh->pg[end[i]].name, + NULL); + va_end(args); + if (-1 == r) { + free(end); + return -1; + } + } + + free(end); + } else { + va_start(args, name); + r = sam_hdr_vadd(sh, "PG", args, + "ID", sam_hdr_PG_ID(sh, name), + "PN", name, + NULL); + va_end(args); + if (-1 == r) + return -1; + } + + //sam_hdr_dump(sh); + + return 0; +} + +/* + * A function to help with construction of CL tags in @PG records. + * Takes an argc, argv pair and returns a single space-separated string. + * This string should be deallocated by the calling function. + * + * Returns malloced char * on success + * NULL on failure + */ +char *stringify_argv(int argc, char *argv[]) { + char *str, *cp; + size_t nbytes = 1; + int i, j; + + /* Allocate */ + for (i = 0; i < argc; i++) { + nbytes += strlen(argv[i]) + 1; + } + if (!(str = malloc(nbytes))) + return NULL; + + /* Copy */ + cp = str; + for (i = 0; i < argc; i++) { + j = 0; + while (argv[i][j]) { + if (argv[i][j] == '\t') + *cp++ = ' '; + else + *cp++ = argv[i][j]; + j++; + } + *cp++ = ' '; + } + *cp++ = 0; + + return str; +} diff --git a/star-sys/STAR/source/htslib/cram/sam_header.h b/star-sys/STAR/source/htslib/cram/sam_header.h new file mode 100644 index 0000000..b9ea298 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/sam_header.h @@ -0,0 +1,452 @@ +/* +Copyright (c) 2013-2014 Genome Research Ltd.
+Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/*! \file + * SAM header parsing. + * + * These functions can be shared between SAM, BAM and CRAM file + * formats as all three internally use the same string encoding for + * header fields. + * + * Consider using the scram() generic API and calling + * scram_get_header() to obtain the format-specific pointer to the + * SAM_hdr struct. + */ + +/* + * TODO. + * + * - Sort order (parse to struct, enum type, updating funcs) + * - Removal of lines. 
+ * - Updating of lines + */ + +#ifndef _SAM_HDR_H_ +#define _SAM_HDR_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef HAVE_CONFIG_H +#include "io_lib_config.h" +#endif + +#include <stdarg.h> + +#include "cram/string_alloc.h" +#include "cram/pooled_alloc.h" + +#include "htslib/khash.h" +#include "htslib/kstring.h" + +// For structure assignment. Eg kstring_t s = KS_INITIALIZER; +#define KS_INITIALIZER {0,0,0} + +// For initialisation elsewhere. Eg KS_INIT(x->str); +#define KS_INIT(ks) ((ks)->l = 0, (ks)->m = 0, (ks)->s = NULL) + +// Frees the string subfield only. Assumes 's' itself is static. +#define KS_FREE(ks) do { if ((ks)->s) free((ks)->s); } while(0) + +/* + * Proposed new SAM header parsing + +1 @SQ ID:foo LN:100 +2 @SQ ID:bar LN:200 +3 @SQ ID:ram LN:300 UR:xyz +4 @RG ID:r ... +5 @RG ID:s ... + +Hash table for 2-char @keys without dup entries. +If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}. + +HASH("SQ")--\ + | + (3) <-> 1 <-> 2 <-> 3 <-> (1) + +HASH("RG")--\ + | + (5) <-> 4 <-> 5 <-> (4) + +Items stored in the hash values also form their own linked lists: +Ie SQ->ID(foo)->LN(100) + SQ->ID(bar)->LN(200) + SQ->ID(ram)->LN(300)->UR(xyz) + RG->ID(r) + */ + +/*! A single key:value pair on a header line + * + * These form a linked list and hold strings. The strings are + * allocated from a string_alloc_t pool referenced in the master + * SAM_hdr structure. Do not attempt to free, malloc or manipulate + * these strings directly. + */ +typedef struct SAM_hdr_tag_s { + struct SAM_hdr_tag_s *next; + char *str; + int len; +} SAM_hdr_tag; + +/*! The parsed version of the SAM header string. + * + * Each header type (SQ, RG, HD, etc) points to its own SAM_hdr_type + * struct via the main hash table h in the SAM_hdr struct. + * + * These in turn consist of circular bi-directional linked lists (ie + * rings) to hold the multiple instances of the same header type + * code.
For example if we have 5 \@SQ lines the primary hash table + * will key on \@SQ pointing to the first SAM_hdr_type and that in turn + * will be part of a ring of 5 elements. + * + * For each SAM_hdr_type structure we also point to a SAM_hdr_tag + * structure which holds the tokenised attributes; the tab separated + * key:value pairs per line. + */ +typedef struct SAM_hdr_item_s { + struct SAM_hdr_item_s *next; // circular + struct SAM_hdr_item_s *prev; + SAM_hdr_tag *tag; // first tag + int order; // 0 upwards +} SAM_hdr_type; + +/*! Parsed \@SQ lines */ +typedef struct { + char *name; + uint32_t len; + SAM_hdr_type *ty; + SAM_hdr_tag *tag; +} SAM_SQ; + +/*! Parsed \@RG lines */ +typedef struct { + char *name; + SAM_hdr_type *ty; + SAM_hdr_tag *tag; + int name_len; + int id; // numerical ID +} SAM_RG; + +/*! Parsed \@PG lines */ +typedef struct { + char *name; + SAM_hdr_type *ty; + SAM_hdr_tag *tag; + int name_len; + int id; // numerical ID + int prev_id; // -1 if none +} SAM_PG; + +KHASH_MAP_INIT_INT(sam_hdr, SAM_hdr_type*) +KHASH_MAP_INIT_STR(m_s2i, int) + +/*! Primary structure for header manipulation + * + * The initial header text is held in the text kstring_t, but is also + * parsed out into SQ, RG and PG arrays. These have a hash table + * associated with each to allow lookup by ID or SN fields instead of + * their numeric array indices. Additionally PG has an array to hold + * the linked list start points (the last in a PP chain). + * + * Use the appropriate sam_hdr_* functions to edit the header, and + * call sam_hdr_rebuild() any time the textual form needs to be + * updated again.
+ */ +typedef struct { + kstring_t text; //!< concatenated text, indexed by SAM_hdr_tag + khash_t(sam_hdr) *h; //!< Maps header type code to its ring of SAM_hdr_type + string_alloc_t *str_pool; //!< Pool of SAM_hdr_tag->str strings + pool_alloc_t *type_pool;//!< Pool of SAM_hdr_type structs + pool_alloc_t *tag_pool; //!< Pool of SAM_hdr_tag structs + + // @SQ lines / references + int nref; //!< Number of \@SQ lines + SAM_SQ *ref; //!< Array of parsed \@SQ lines + khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to ref[] index + + // @RG lines / read-groups + int nrg; //!< Number of \@RG lines + SAM_RG *rg; //!< Array of parsed \@RG lines + khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index + + // @PG lines / programs + int npg; //!< Number of \@PG lines + int npg_end; //!< Number of terminating \@PG lines + int npg_end_alloc; //!< Size of pg_end field + SAM_PG *pg; //!< Array of parsed \@PG lines + khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index + int *pg_end; //!< pg[] indices of the \@PG chain tails + + // @cond internal + char ID_buf[1024]; // scratch buffer returned by sam_hdr_PG_ID() + int ID_cnt; // next numeric suffix tried by sam_hdr_PG_ID() + int ref_count; // number of uses of this SAM_hdr + // @endcond +} SAM_hdr; + +/*! Creates an empty SAM header, ready to be populated. + * + * @return + * Returns a SAM_hdr struct on success (free with sam_hdr_free()) + * NULL on failure + */ +SAM_hdr *sam_hdr_new(void); + +/*! Tokenises a SAM header into a hash table. + * + * Also extracts a few bits on specific data types, such as @RG lines. + * + * @return + * Returns a SAM_hdr struct on success (free with sam_hdr_free()); + * NULL on failure + */ +#ifdef SAMTOOLS +SAM_hdr *sam_hdr_parse_(const char *hdr, int len); +#else +SAM_hdr *sam_hdr_parse(const char *hdr, int len); +#endif + + +/*! Produces a duplicate copy of hdr and returns it. + * @return + * Returns NULL on failure + */ +SAM_hdr *sam_hdr_dup(SAM_hdr *hdr); + + +/*! Increments a reference count on hdr.
+ * + * This permits multiple files to share the same header, all calling + * sam_hdr_free when done, without causing errors for other open files. + */ +void sam_hdr_incr_ref(SAM_hdr *hdr); + + +/*! Decrements a reference count on hdr. + * + * This permits multiple files to share the same header, all calling + * sam_hdr_free when done, without causing errors for other open files. + * + * If the reference count hits zero then the header is automatically + * freed. This makes it a synonym for sam_hdr_free(). + */ +void sam_hdr_decr_ref(SAM_hdr *hdr); + + +/*! Deallocates all storage used by a SAM_hdr struct. + * + * This also decrements the header reference count. If after decrementing + * it is still non-zero then the header is assumed to be in use by another + * caller and the free is not done. + * + * This is a synonym for sam_hdr_decr_ref(). + */ +void sam_hdr_free(SAM_hdr *hdr); + +/*! Returns the current length of the SAM_hdr in text form. + * + * Call sam_hdr_rebuild() first if editing has taken place. + */ +int sam_hdr_length(SAM_hdr *hdr); + +/*! Returns the string form of the SAM_hdr. + * + * Call sam_hdr_rebuild() first if editing has taken place. + */ +char *sam_hdr_str(SAM_hdr *hdr); + +/*! Appends a formatted line to an existing SAM header. + * + * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with + * optional new-line. If it contains more than 1 line then multiple lines + * will be added in order. + * + * Len is the length of the text data, or 0 if unknown (in which case + * it should be null terminated). + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len); + +/*! Adds a single line to a SAM header. + * + * Specify type and one or more key,value pairs, ending with the NULL key. + * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL). + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int sam_hdr_add(SAM_hdr *sh, const char *type, ...); + +/*!
Adds a single line to a SAM header. + * + * This is much like sam_hdr_add() but with the additional va_list + * argument. This is followed by specifying type and one or more + * key,value pairs, ending with the NULL key. + * + * Eg. sam_hdr_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL). + * + * The purpose of the additional va_list parameter is to permit other + * varargs functions to call this while including their own additional + * parameters; an example is in sam_hdr_add_PG(). + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...); + +/*! + * @return + * Returns the first header item matching 'type'. If ID is non-NULL it checks + * for the tag ID: and compares against the specified ID. + * + * Returns NULL if no type/ID is found + */ +SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type, + char *ID_key, char *ID_value); + +/*! + * + * As per SAM_hdr_type, but returns a complete line of formatted text + * for a specific head type/ID combination. If ID is NULL then it returns + * the first line of the specified type. + * + * The returned string is malloced and should be freed by the calling + * function with free(). + * + * @return + * Returns NULL if no type/ID is found. + */ +char *sam_hdr_find_line(SAM_hdr *hdr, char *type, + char *ID_key, char *ID_value); + +/*! Looks for a specific key in a single sam header line. + * + * If prev is non-NULL it also fills this out with the previous tag, to + * permit use in key removal. *prev is set to NULL when the tag is the first + * key in the list. When a tag isn't found, prev (if non NULL) will be the last + * tag in the existing list. + * + * @return + * Returns the tag pointer on success; + * NULL on failure + */ +SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh, + SAM_hdr_type *type, + char *key, + SAM_hdr_tag **prev); + +/*! Adds or updates tag key,value pairs in a header line. 
+ * + * Eg for adding M5 tags to @SQ lines or updating sort order for the + * @HD line (although use the sam_hdr_sort_order() function for + * HD manipulation, which is a wrapper around this function). + * + * Specify multiple key,value pairs ending in NULL. + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...); + +/*! Reconstructs the kstring from the header hash table. + * @return + * Returns 0 on success; + * -1 on failure + */ +int sam_hdr_rebuild(SAM_hdr *hdr); + +/*! Looks up a reference sequence by name and returns the numerical ID. + * @return + * Returns -1 if unknown reference. + */ +int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref); + +/*! Looks up a read-group by name and returns a pointer to its parsed + * SAM_RG record. + * + * @return + * Returns NULL on failure + */ +SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg); + +/*! Fixes any PP links in @PG headers. + * + * If the entries are in order then this doesn't need doing, but in case + * our header is out of order this goes through the hdr->pg[] array + * setting the prev_id field. + * + * @return + * Returns 0 on success; + * -1 on failure (indicating broken PG/PP records) + */ +int sam_hdr_link_pg(SAM_hdr *hdr); + + +/*! Add an @PG line. + * + * If we wish complete control over this use sam_hdr_add() directly. This + * function uses that, but attempts to do a lot of tedious house work for + * you too. + * + * - It will generate a suitable ID if the supplied one clashes. + * - It will generate multiple @PG records if we have multiple PG chains. + * + * Call it as per sam_hdr_add() with a series of key,value pairs ending + * in NULL. + * + * @return + * Returns 0 on success; + * -1 on failure + */ +int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...); + +/*! + * A function to help with construction of CL tags in @PG records. + * Takes an argc, argv pair and returns a single space-separated string.
+ * This string should be deallocated by the calling function. + * + * @return + * Returns malloced char * on success; + * NULL on failure + */ +char *stringify_argv(int argc, char *argv[]); + +#ifdef __cplusplus +} +#endif + +#endif /* _SAM_HDR_H_ */ diff --git a/star-sys/STAR/source/htslib/cram/string_alloc.c b/star-sys/STAR/source/htslib/cram/string_alloc.c new file mode 100644 index 0000000..d543e14 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/string_alloc.c @@ -0,0 +1,153 @@ +/* +Copyright (c) 2010 Genome Research Ltd. +Author: Andrew Whitwham + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +/* + A pooled string allocator intended to cut down on the + memory overhead of many small string allocations. + + Andrew Whitwham, September 2010. +*/ + +#include +#include +#include + +#include "string_alloc.h" + +#define MIN_STR_SIZE 1024 + + +/* creates the string pool. max_length is the initial size + a single string can be. The max_length can grow as + needed */ + +string_alloc_t *string_pool_create(size_t max_length) { + string_alloc_t *a_str; + + if (NULL == (a_str = (string_alloc_t *)malloc(sizeof(*a_str)))) { + return NULL; + } + + if (max_length < MIN_STR_SIZE) max_length = MIN_STR_SIZE; + + a_str->nstrings = 0; + a_str->max_length = max_length; + a_str->strings = NULL; + + return a_str; +} + + +/* internal function to do the actual memory allocation: appends one new + max_length sized buffer to the pool and returns it, or NULL on failure */ + +static string_t *new_string_pool(string_alloc_t *a_str) { + string_t *str; + + str = realloc(a_str->strings, (a_str->nstrings + 1) * sizeof(*a_str->strings)); + + if (NULL == str) return NULL; + + a_str->strings = str; + str = &a_str->strings[a_str->nstrings]; + + str->str = malloc(a_str->max_length); + + if (NULL == str->str) return NULL; + + str->used = 0; + a_str->nstrings++; + + return str; +} + + +/* free allocated memory; a NULL pool is ignored */ + +void string_pool_destroy(string_alloc_t *a_str) { + size_t i; + + if (NULL == a_str) return; + + for (i = 0; i < a_str->nstrings; i++) { + free(a_str->strings[i].str); + } + + free(a_str->strings); + free(a_str); +} + + +/* allocate space for a string */ + +char
*string_alloc(string_alloc_t *a_str, size_t length) { + string_t *str; + char *ret; + + if (length <= 0) return NULL; + + // add to last string pool if we have space + if (a_str->nstrings) { + str = &a_str->strings[a_str->nstrings - 1]; + + if (str->used + length < a_str->max_length) { + ret = str->str + str->used; + str->used += length; + return ret; + } + } + + // increase the max length if needs be + if (length > a_str->max_length) a_str->max_length = length; + + // need a new string pool + str = new_string_pool(a_str); + + if (NULL == str) return NULL; + + str->used = length; + return str->str; +} + + +/* equivalent to strdup */ + +char *string_dup(string_alloc_t *a_str, char *instr) { + return string_ndup(a_str, instr, strlen(instr)); +} + +char *string_ndup(string_alloc_t *a_str, char *instr, size_t len) { + char *str = string_alloc(a_str, len + 1); + + if (NULL == str) return NULL; + + strncpy(str, instr, len); + str[len] = 0; + + return str; +} diff --git a/star-sys/STAR/source/htslib/cram/string_alloc.h b/star-sys/STAR/source/htslib/cram/string_alloc.h new file mode 100644 index 0000000..71ae26d --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/string_alloc.h @@ -0,0 +1,69 @@ +/* +Copyright (c) 2010 Genome Research Ltd. +Author: Andrew Whitwham + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _STRING_ALLOC_H_ +#define _STRING_ALLOC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdlib.h> + +/* + * A pooled string allocator intended to cut down on the + * memory overhead of many small string allocations. + * + * Andrew Whitwham, September 2010. + */ + +typedef struct { + char *str; + size_t used; +} string_t; + +typedef struct { + size_t max_length; + size_t nstrings; + string_t *strings; +} string_alloc_t; + +string_alloc_t *string_pool_create(size_t max_length); +void string_pool_destroy(string_alloc_t *a_str); +char *string_alloc(string_alloc_t *a_str, size_t length); +char *string_dup(string_alloc_t *a_str, char *instr); +char *string_ndup(string_alloc_t *a_str, char *instr, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif /* _STRING_ALLOC_H_ */ + diff --git a/star-sys/STAR/source/htslib/cram/thread_pool.c b/star-sys/STAR/source/htslib/cram/thread_pool.c new file mode 100644 index 0000000..90652a7 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/thread_pool.c @@ -0,0 +1,713 @@ +/* +Copyright (c) 2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1.
Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include + +#include +#include +#include +#include +#include + +#include "cram/thread_pool.h" + +//#define DEBUG +#define DEBUG_TIME + +#ifdef DEBUG +static int worker_id(t_pool *p) { + int i; + pthread_t s = pthread_self(); + for (i = 0; i < p->tsize; i++) { + if (pthread_equal(s, p->t[i])) + return i; + } + return -1; +} +#endif + +/* ---------------------------------------------------------------------------- + * A queue to hold results from the thread pool. + * + * Each thread pool may have jobs of multiple types being queued up and + * interleaved, so we allow several results queue per pool. 
+ * + * The jobs themselves are expected to push their results onto their + * appropriate results queue. + */ + +/* + * Adds a result to the end of the result queue. + * + * Returns 0 on success; + * -1 on failure + */ +static int t_pool_add_result(t_pool_job *j, void *data) { + t_results_queue *q = j->q; + t_pool_result *r; + +#ifdef DEBUG + fprintf(stderr, "%d: Adding resulting to queue %p, serial %d\n", + worker_id(j->p), q, j->serial); +#endif + + /* No results queue is fine if we don't want any results back */ + if (!q) + return 0; + + if (!(r = malloc(sizeof(*r)))) + return -1; + + r->next = NULL; + r->data = data; + r->serial = j->serial; + + pthread_mutex_lock(&q->result_m); + if (q->result_tail) { + q->result_tail->next = r; + q->result_tail = r; + } else { + q->result_head = q->result_tail = r; + } + q->queue_len++; + q->pending--; + +#ifdef DEBUG + fprintf(stderr, "%d: Broadcasting result_avail (id %d)\n", + worker_id(j->p), r->serial); +#endif + pthread_cond_broadcast(&q->result_avail_c); +#ifdef DEBUG + fprintf(stderr, "%d: Broadcast complete\n", worker_id(j->p)); +#endif + + pthread_mutex_unlock(&q->result_m); + + return 0; +} + +/* Core of t_pool_next_result() */ +static t_pool_result *t_pool_next_result_locked(t_results_queue *q) { + t_pool_result *r, *last; + + for (last = NULL, r = q->result_head; r; last = r, r = r->next) { + if (r->serial == q->next_serial) + break; + } + + if (r) { + if (q->result_head == r) + q->result_head = r->next; + else + last->next = r->next; + + if (q->result_tail == r) + q->result_tail = last; + + if (!q->result_head) + q->result_tail = NULL; + + q->next_serial++; + q->queue_len--; + } + + return r; +} + +/* + * Pulls a result off the head of the result queue. Caller should + * free it (and any internals as appropriate) after use. This doesn't + * wait for a result to be present. + * + * Results will be returned in strict order. + * + * Returns t_pool_result pointer if a result is ready. + * NULL if not. 
+ */ +t_pool_result *t_pool_next_result(t_results_queue *q) { + t_pool_result *r; + +#ifdef DEBUG + fprintf(stderr, "Requesting next result on queue %p\n", q); +#endif + + pthread_mutex_lock(&q->result_m); + r = t_pool_next_result_locked(q); + pthread_mutex_unlock(&q->result_m); + +#ifdef DEBUG + fprintf(stderr, "(q=%p) Found %p\n", q, r); +#endif + + return r; +} + +t_pool_result *t_pool_next_result_wait(t_results_queue *q) { + t_pool_result *r; + +#ifdef DEBUG + fprintf(stderr, "Waiting for result %d...\n", q->next_serial); +#endif + + pthread_mutex_lock(&q->result_m); + while (!(r = t_pool_next_result_locked(q))) { + /* Possible race here now avoided via _locked() call, but in case... + * the wait is bounded to 10 seconds so the queue is re-checked + * periodically even if no wake-up arrives. */ + struct timeval now; + struct timespec timeout; + + gettimeofday(&now, NULL); + timeout.tv_sec = now.tv_sec + 10; + timeout.tv_nsec = now.tv_usec * 1000; + + pthread_cond_timedwait(&q->result_avail_c, &q->result_m, &timeout); + } + pthread_mutex_unlock(&q->result_m); + + return r; +} + +/* + * Returns true if there are no items on the finished results queue and + * also none still pending. + */ +int t_pool_results_queue_empty(t_results_queue *q) { + int empty; + + pthread_mutex_lock(&q->result_m); + empty = q->queue_len == 0 && q->pending == 0; + pthread_mutex_unlock(&q->result_m); + + return empty; +} + + +/* + * Returns the number of completed jobs on the results queue. + * (t_pool_results_queue_sz() below additionally counts pending jobs.) + */ +int t_pool_results_queue_len(t_results_queue *q) { + int len; + + pthread_mutex_lock(&q->result_m); + len = q->queue_len; + pthread_mutex_unlock(&q->result_m); + + return len; +} + +int t_pool_results_queue_sz(t_results_queue *q) { + int len; + + pthread_mutex_lock(&q->result_m); + len = q->queue_len + q->pending; + pthread_mutex_unlock(&q->result_m); + + return len; +} + +/* + * Frees a result 'r' and if free_data is true also frees + * the internal r->data result too.
+ *
+ * A NULL 'r' is accepted and ignored. r->data is released with free(),
+ * so free_data must only be set when the job allocated it that way.
+ */
+void t_pool_delete_result(t_pool_result *r, int free_data) {
+    if (!r)
+        return;
+
+    if (free_data && r->data)
+        free(r->data);
+
+    free(r);
+}
+
+/*
+ * Initialises a results queue.
+ *
+ * The queue starts empty with both serial counters at zero, so the
+ * first job dispatched to it is also the first result handed back.
+ *
+ * Results queue pointer on success;
+ * NULL on failure
+ */
+t_results_queue *t_results_queue_init(void) {
+    t_results_queue *q = malloc(sizeof(*q));
+
+    /* Report allocation failure as documented instead of crashing below */
+    if (!q)
+        return NULL;
+
+    pthread_mutex_init(&q->result_m, NULL);
+    pthread_cond_init(&q->result_avail_c, NULL);
+
+    q->result_head = NULL;
+    q->result_tail = NULL;
+    q->next_serial = 0;
+    q->curr_serial = 0;
+    q->queue_len   = 0;
+    q->pending     = 0;
+
+    return q;
+}
+
+/*
+ * Deallocates memory for a results queue.
+ *
+ * A NULL 'q' is accepted and ignored. Results still linked on the queue
+ * are not walked or freed here; collect them before destroying it.
+ */
+void t_results_queue_destroy(t_results_queue *q) {
+#ifdef DEBUG
+    fprintf(stderr, "Destroying results queue %p\n", q);
+#endif
+
+    if (!q)
+        return;
+
+    pthread_mutex_destroy(&q->result_m);
+    pthread_cond_destroy(&q->result_avail_c);
+
+    /* Poison the block so use-after-destroy is easier to spot */
+    memset(q, 0xbb, sizeof(*q));
+    free(q);
+
+#ifdef DEBUG
+    fprintf(stderr, "Destroyed results queue %p\n", q);
+#endif
+}
+
+/* ----------------------------------------------------------------------------
+ * The thread pool.
+ */
+
+/* Difference t2 - t1 between two struct timeval values, in microseconds */
+#define TDIFF(t2,t1) ((t2.tv_sec-t1.tv_sec)*1000000 + t2.tv_usec-t1.tv_usec)
+
+/*
+ * A worker thread.
+ *
+ * Each thread waits for the pool to be non-empty.
+ * As soon as this applies, one of them succeeds in getting the lock
+ * and then executes the job.
+ *
+ * arg is the owning t_pool. The loop only ends when p->shutdown is
+ * observed, at which point the thread exits; the shutdown check comes
+ * before a job is taken, so jobs still queued then are not run.
+ */
+static void *t_pool_worker(void *arg) {
+    t_pool *p = (t_pool *)arg;
+    t_pool_job *j;
+#ifdef DEBUG_TIME
+    struct timeval t1, t2, t3;
+#endif
+
+    for (;;) {
+        // Pop an item off the pool queue
+#ifdef DEBUG_TIME
+        gettimeofday(&t1, NULL);
+#endif
+
+        pthread_mutex_lock(&p->pool_m);
+
+#ifdef DEBUG_TIME
+        gettimeofday(&t2, NULL);
+        p->wait_time += TDIFF(t2,t1);
+#endif
+
+        // Counted as idle from here until a job (or shutdown) is seen;
+        // t_pool_flush() compares nwaiting against tsize.
+        p->nwaiting++;
+        while (!p->head && !p->shutdown) {
+            if (p->njobs == 0)
+                pthread_cond_signal(&p->empty_c);
+#ifdef DEBUG_TIME
+            gettimeofday(&t2, NULL);
+#endif
+
+            pthread_cond_wait(&p->pending_c, &p->pool_m);
+
+#ifdef DEBUG_TIME
+            gettimeofday(&t3, NULL);
+            p->wait_time += TDIFF(t3,t2);
+#endif
+        }
+
+        p->nwaiting--;
+
+        if (p->shutdown) {
+#ifdef DEBUG_TIME
+            // t1/t3 only exist in timing builds, and t3 may not have
+            // been set yet on this pass, so take a fresh timestamp.
+            gettimeofday(&t3, NULL);
+            p->total_time += TDIFF(t3,t1);
+#endif
+#ifdef DEBUG
+            fprintf(stderr, "%d: Shutting down\n", worker_id(p));
+#endif
+            pthread_mutex_unlock(&p->pool_m);
+            pthread_exit(NULL);
+        }
+
+        j = p->head;
+        if (!(p->head = j->next))
+            p->tail = NULL;
+
+        // The queue was full before this pop: wake a blocked dispatcher
+        if (p->njobs-- == p->qsize)
+            pthread_cond_signal(&p->full_c);
+
+        if (p->njobs == 0)
+            pthread_cond_signal(&p->empty_c);
+
+        pthread_mutex_unlock(&p->pool_m);
+
+        // We have job 'j' - now execute it.
+        t_pool_add_result(j, j->func(j->arg));
+#ifdef DEBUG_TIME
+        pthread_mutex_lock(&p->pool_m);
+        gettimeofday(&t3, NULL);
+        p->total_time += TDIFF(t3,t1);
+        pthread_mutex_unlock(&p->pool_m);
+#endif
+        // Poison the job before freeing to expose use-after-free
+        memset(j, 0xbb, sizeof(*j));
+        free(j);
+    }
+
+    return NULL;
+}
+
+/*
+ * Creates a worker pool of length qsize with tsize worker threads.
+ *
+ * qsize - number of queued jobs at which t_pool_dispatch() blocks.
+ * tsize - number of worker threads to start.
+ *
+ * If an allocation or a thread creation fails, every worker already
+ * started is shut down and joined and all pool resources are released
+ * before returning.
+ *
+ * Returns pool pointer on success;
+ *         NULL on failure
+ */
+t_pool *t_pool_init(int qsize, int tsize) {
+    int i;
+    t_pool *p = malloc(sizeof(*p));
+
+    if (!p)
+        return NULL;
+
+    p->qsize = qsize;
+    p->tsize = tsize;
+    p->njobs = 0;
+    p->nwaiting = 0;
+    p->shutdown = 0;
+    p->head = p->tail = NULL;
+#ifdef DEBUG_TIME
+    p->total_time = p->wait_time = 0;
+#endif
+
+    p->t = malloc(tsize * sizeof(p->t[0]));
+    /* malloc(0) may legitimately return NULL, so only fail for tsize > 0 */
+    if (!p->t && tsize > 0) {
+        free(p);
+        return NULL;
+    }
+
+    pthread_mutex_init(&p->pool_m, NULL);
+    pthread_cond_init(&p->empty_c, NULL);
+    pthread_cond_init(&p->pending_c, NULL);
+    pthread_cond_init(&p->full_c, NULL);
+
+    for (i = 0; i < tsize; i++) {
+        if (0 != pthread_create(&p->t[i], NULL, t_pool_worker, p)) {
+            /*
+             * Unwind: ask the workers started so far to exit, wait for
+             * them, then release everything rather than leaking it.
+             */
+            pthread_mutex_lock(&p->pool_m);
+            p->shutdown = 1;
+            pthread_cond_broadcast(&p->pending_c);
+            pthread_mutex_unlock(&p->pool_m);
+
+            while (--i >= 0)
+                pthread_join(p->t[i], NULL);
+
+            pthread_mutex_destroy(&p->pool_m);
+            pthread_cond_destroy(&p->empty_c);
+            pthread_cond_destroy(&p->pending_c);
+            pthread_cond_destroy(&p->full_c);
+
+            free(p->t);
+            free(p);
+            return NULL;
+        }
+    }
+
+    return p;
+}
+
+/*
+ * Adds an item to the work pool.
+ *
+ * FIXME: Maybe return 1,0,-1 and distinguish between job dispatched vs
+ * result returned. Ie rather than blocking on full queue we're permitted
+ * to return early on "result available" event too.
+ * Caller would then have a while loop around t_pool_dispatch.
+ * Or, return -1 and set errno to EAGAIN to indicate job not yet submitted.
+ *
+ * p    - pool to run the job on.
+ * q    - results queue to receive func's return value, or NULL when no
+ *        result is wanted.
+ * func - job function; arg is passed to it unchanged.
+ *
+ * Blocks while the pool already holds qsize queued jobs.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int t_pool_dispatch(t_pool *p, t_results_queue *q,
+                    void *(*func)(void *arg), void *arg) {
+    t_pool_job *j = malloc(sizeof(*j));
+
+    if (!j)
+        return -1;
+    j->func = func;
+    j->arg = arg;
+    j->next = NULL;
+    j->p = p;
+    j->q = q;
+    if (q) {
+        /* Claim the next serial and count the job as pending in one step */
+        pthread_mutex_lock(&q->result_m);
+        j->serial = q->curr_serial++;
+        q->pending++;
+        pthread_mutex_unlock(&q->result_m);
+    } else {
+        j->serial = 0;
+    }
+
+#ifdef DEBUG
+    fprintf(stderr, "Dispatching job %p for queue %p, serial %d\n", j, q, j->serial);
+#endif
+
+    pthread_mutex_lock(&p->pool_m);
+
+    // Check if queue is full
+    while (p->njobs == p->qsize)
+        pthread_cond_wait(&p->full_c, &p->pool_m);
+
+    p->njobs++;
+
+    if (p->tail) {
+        p->tail->next = j;
+        p->tail = j;
+    } else {
+        p->head = p->tail = j;
+    }
+
+    if (p->njobs == 1) {
+        // First job => tell all worker threads to start up
+        pthread_cond_broadcast(&p->pending_c);
+    }
+
+    pthread_mutex_unlock(&p->pool_m);
+
+#ifdef DEBUG
+    fprintf(stderr, "Dispatched (serial %d)\n", j->serial);
+#endif
+
+    return 0;
+}
+
+/*
+ * As above but optional non-block flag.
+ *
+ * nonblock  0 => block if input queue is full
+ * nonblock +1 => don't block if input queue is full, but do not add task
+ * nonblock -1 => add task regardless of whether queue is full (over-size)
+ *
+ * Returns 0 on success
+ *        -1 on failure; errno is set to EAGAIN when nonblock is +1 and
+ *           the queue was full, in which case the job was not submitted.
+ */
+int t_pool_dispatch2(t_pool *p, t_results_queue *q,
+                     void *(*func)(void *arg), void *arg, int nonblock) {
+    t_pool_job *j = malloc(sizeof(*j));
+
+    if (!j)
+        return -1;
+    j->func = func;
+    j->arg = arg;
+    j->next = NULL;
+    j->p = p;
+    j->q = q;
+    j->serial = 0;  /* real value is claimed below, once the job is accepted */
+
+    pthread_mutex_lock(&p->pool_m);
+
+    if (p->njobs == p->qsize && nonblock == 1) {
+        pthread_mutex_unlock(&p->pool_m);
+        errno = EAGAIN;
+        free(j);
+        return -1;
+    }
+
+    if (q) {
+        /*
+         * Read and advance curr_serial in a single critical section.
+         * Sampling it earlier and incrementing here would let two
+         * concurrent dispatchers obtain the same serial, after which
+         * ordered retrieval could never find the missing one.
+         */
+        pthread_mutex_lock(&q->result_m);
+        j->serial = q->curr_serial++;
+        q->pending++;
+        pthread_mutex_unlock(&q->result_m);
+    }
+
+#ifdef DEBUG
+    fprintf(stderr, "Dispatching job for queue %p, serial %d\n", q, j->serial);
+#endif
+
+    // Check if queue is full
+    if (nonblock == 0)
+        while (p->njobs == p->qsize)
+            pthread_cond_wait(&p->full_c, &p->pool_m);
+
+    p->njobs++;
+
+//    if (q->curr_serial % 100 == 0)
+//        fprintf(stderr, "p->njobs = %d p->qsize = %d\n", p->njobs, p->qsize);
+
+    if (p->tail) {
+        p->tail->next = j;
+        p->tail = j;
+    } else {
+        p->head = p->tail = j;
+    }
+
+#ifdef DEBUG
+    fprintf(stderr, "Dispatched (serial %d)\n", j->serial);
+#endif
+
+    if (p->njobs == 1) {
+        // First job => tell all worker threads to start up
+        pthread_cond_broadcast(&p->pending_c);
+    }
+
+    pthread_mutex_unlock(&p->pool_m);
+
+    return 0;
+}
+
+/*
+ * Flushes the pool, but doesn't exit. This simply drains the queue and
+ * ensures all worker threads have finished their current task.
+ *
+ * Waits on p->empty_c until no jobs are queued and every worker is
+ * counted in p->nwaiting. The code as written always returns 0.
+ *
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+int t_pool_flush(t_pool *p) {
+#ifdef DEBUG
+    fprintf(stderr, "Flushing pool %p\n", p);
+#endif
+
+    // Drains the queue
+    pthread_mutex_lock(&p->pool_m);
+    while (p->njobs || p->nwaiting != p->tsize)
+        pthread_cond_wait(&p->empty_c, &p->pool_m);
+
+    pthread_mutex_unlock(&p->pool_m);
+
+#ifdef DEBUG
+    fprintf(stderr, "Flushed complete for pool %p, njobs=%d, nwaiting=%d\n",
+            p, p->njobs, p->nwaiting);
+#endif
+
+    return 0;
+}
+
+/*
+ * Destroys a thread pool. If 'kill' is true the threads are terminated now,
+ * otherwise they are joined into the main thread so they will finish their
+ * current work load.
+ *
+ * Use t_pool_destroy(p,0) after a t_pool_flush(p) on a normal shutdown or
+ * t_pool_destroy(p,1) to quickly exit after a fatal error.
+ *
+ * In both cases the mutex, the condition variables, the thread array and
+ * the pool itself are released before returning, so 'p' must not be used
+ * afterwards.
+ *
+ * NOTE(review): the kill path signals the workers with SIGINT and does
+ * not join them, so they may still be running when the pool is freed;
+ * presumably callers exit immediately afterwards - confirm.
+ */
+void t_pool_destroy(t_pool *p, int kill) {
+    int i;
+
+#ifdef DEBUG
+    fprintf(stderr, "Destroying pool %p, kill=%d\n", p, kill);
+#endif
+
+    /* Send shutdown message to worker threads */
+    if (!kill) {
+        pthread_mutex_lock(&p->pool_m);
+        p->shutdown = 1;
+
+#ifdef DEBUG
+        fprintf(stderr, "Sending shutdown request\n");
+#endif
+
+        /* Wake every idle worker so each one sees the shutdown flag */
+        pthread_cond_broadcast(&p->pending_c);
+        pthread_mutex_unlock(&p->pool_m);
+
+#ifdef DEBUG
+        fprintf(stderr, "Shutdown complete\n");
+#endif
+        for (i = 0; i < p->tsize; i++)
+            pthread_join(p->t[i], NULL);
+    } else {
+        for (i = 0; i < p->tsize; i++)
+            pthread_kill(p->t[i], SIGINT);
+    }
+
+    pthread_mutex_destroy(&p->pool_m);
+    pthread_cond_destroy(&p->empty_c);
+    pthread_cond_destroy(&p->pending_c);
+    pthread_cond_destroy(&p->full_c);
+
+#ifdef DEBUG_TIME
+    /* Times are accumulated in microseconds; report them in seconds */
+    fprintf(stderr, "Total time=%f\n", p->total_time / 1000000.0);
+    fprintf(stderr, "Wait time=%f\n", p->wait_time / 1000000.0);
+    fprintf(stderr, "%d%% utilisation\n",
+            (int)(100 - ((100.0 * p->wait_time) / p->total_time + 0.5)));
+#endif
+
+    free(p->t);
+    free(p);
+
+#ifdef DEBUG
+    fprintf(stderr, "Destroyed pool %p\n", p);
+#endif
+}
+
+
+/*----------------------------------------------------------------------------- + * Test app. + */ + +#ifdef TEST_MAIN + +#include +#include + +void *doit(void *arg) { + int i, k, x = 0; + int job = *(int *)arg; + int *res; + + printf("Worker: execute job %d\n", job); + + usleep(random() % 1000000); // to coerce job completion out of order + if (0) { + for (k = 0; k < 100; k++) { + for (i = 0; i < 100000; i++) { + x++; + x += x * sin(i); + x += x * cos(x); + } + } + x *= 100; + x += job; + } else { + x = job*job; + } + + printf("Worker: job %d terminating, x=%d\n", job, x); + + free(arg); + + res = malloc(sizeof(*res)); + *res = x; + + return res; +} + +#define NTHREADS 8 + +int main(int argc, char **argv) { + t_pool *p = t_pool_init(NTHREADS*2, NTHREADS); + t_results_queue *q = t_results_queue_init(); + int i; + t_pool_result *r; + + // Dispatch jobs + for (i = 0; i < 20; i++) { + int *ip = malloc(sizeof(*ip)); + *ip = i; + printf("Submitting %d\n", i); + t_pool_dispatch(p, q, doit, ip); + + // Check for results + if ((r = t_pool_next_result(q))) { + printf("RESULT: %d\n", *(int *)r->data); + t_pool_delete_result(r, 1); + } + } + + t_pool_flush(p); + + while ((r = t_pool_next_result(q))) { + printf("RESULT: %d\n", *(int *)r->data); + t_pool_delete_result(r, 1); + } + + t_pool_destroy(p, 0); + t_results_queue_destroy(q); + + return 0; +} +#endif diff --git a/star-sys/STAR/source/htslib/cram/thread_pool.h b/star-sys/STAR/source/htslib/cram/thread_pool.h new file mode 100644 index 0000000..18e8b42 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/thread_pool.h @@ -0,0 +1,197 @@ +/* +Copyright (c) 2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This file implements a thread pool for multi-threading applications. + * It consists of two distinct interfaces: thread pools an results queues. + * + * The pool of threads is given a function pointer and void* data to pass in. + * This means the pool can run jobs of multiple types, albeit first come + * first served with no job scheduling. + * + * Upon completion, the return value from the function pointer is added to + * a results queue. We may have multiple queues in use for the one pool. + * + * An example: reading from BAM and writing to CRAM with 10 threads. We'll + * have a pool of 10 threads and two results queues holding decoded BAM blocks + * and encoded CRAM blocks respectively. 
+ */ + +#ifndef _THREAD_POOL_H_ +#define _THREAD_POOL_H_ + +#include + +struct t_pool; +struct t_results_queue; + +typedef struct t_pool_job { + void *(*func)(void *arg); + void *arg; + struct t_pool_job *next; + + struct t_pool *p; + struct t_results_queue *q; + int serial; +} t_pool_job; + +typedef struct t_res { + struct t_res *next; + int serial; // sequential number for ordering + void *data; // result itself +} t_pool_result; + +typedef struct t_pool { + int qsize; // size of queue + int njobs; // pending job count + int nwaiting; // how many workers waiting for new jobs + int shutdown; // true if pool is being destroyed + + // queue of pending jobs + t_pool_job *head, *tail; + + // threads + int tsize; // maximum number of jobs + pthread_t *t; + + // Mutexes + pthread_mutex_t pool_m; // used when updating head/tail + + pthread_cond_t empty_c; + pthread_cond_t pending_c; // not empty + pthread_cond_t full_c; + + // Debugging to check wait time + long long total_time, wait_time; +} t_pool; + +typedef struct t_results_queue { + t_pool_result *result_head; + t_pool_result *result_tail; + int next_serial; + int curr_serial; + int queue_len; // number of items in queue + int pending; // number of pending items (in progress or in pool list) + pthread_mutex_t result_m; + pthread_cond_t result_avail_c; +} t_results_queue; + + +/* + * Creates a worker pool of length qsize with tsize worker threads. + * + * Returns pool pointer on success; + * NULL on failure + */ +t_pool *t_pool_init(int qsize, int tsize); + +/* + * Adds an item to the work pool. + * + * FIXME: Maybe return 1,0,-1 and distinguish between job dispathed vs + * result returned. Ie rather than blocking on full queue we're permitted + * to return early on "result available" event too. + * Caller would then have a while loop around t_pool_dispatch. + * Or, return -1 and set errno to E_AGAIN to indicate job not yet submitted. 
+ * + * Returns 0 on success + * -1 on failure + */ +int t_pool_dispatch(t_pool *p, t_results_queue *q, + void *(*func)(void *arg), void *arg); +int t_pool_dispatch2(t_pool *p, t_results_queue *q, + void *(*func)(void *arg), void *arg, int nonblock); + +/* + * Flushes the pool, but doesn't exit. This simply drains the queue and + * ensures all worker threads have finished their current task. + * + * Returns 0 on success; + * -1 on failure + */ +int t_pool_flush(t_pool *p); + +/* + * Destroys a thread pool. If 'kill' is true the threads are terminated now, + * otherwise they are joined into the main thread so they will finish their + * current work load. + * + * Use t_pool_destroy(p,0) after a t_pool_flush(p) on a normal shutdown or + * t_pool_destroy(p,1) to quickly exit after a fatal error. + */ +void t_pool_destroy(t_pool *p, int kill); + +/* + * Pulls a result off the head of the result queue. Caller should + * free it (and any internals as appropriate) after use. This doesn't + * wait for a result to be present. + * + * Results will be returned in strict order. + * + * Returns t_pool_result pointer if a result is ready. + * NULL if not. + */ +t_pool_result *t_pool_next_result(t_results_queue *q); +t_pool_result *t_pool_next_result_wait(t_results_queue *q); + +/* + * Frees a result 'r' and if free_data is true also frees + * the internal r->data result too. + */ +void t_pool_delete_result(t_pool_result *r, int free_data); + +/* + * Initialises a results queue. + * + * Results queue pointer on success; + * NULL on failure + */ +t_results_queue *t_results_queue_init(void); + +/* Deallocates memory for a results queue */ +void t_results_queue_destroy(t_results_queue *q); + +/* + * Returns true if there are no items on the finished results queue and + * also none still pending. + */ +int t_pool_results_queue_empty(t_results_queue *q); + +/* + * Returns the number of completed jobs on the results queue. 
+ */ +int t_pool_results_queue_len(t_results_queue *q); + +/* + * Returns the number of completed jobs plus the number queued up to run. + */ +int t_pool_results_queue_sz(t_results_queue *q); + +#endif /* _THREAD_POOL_H_ */ diff --git a/star-sys/STAR/source/htslib/cram/vlen.c b/star-sys/STAR/source/htslib/cram/vlen.c new file mode 100644 index 0000000..bc7e7d4 --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/vlen.c @@ -0,0 +1,432 @@ +/* +Author: James Bonfield (jkb@sanger.ac.uk) + +Copyright (c) 1995-1996 MEDICAL RESEARCH COUNCIL +All rights reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1 Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2 Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF +MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or +promote products derived from this software without specific prior written +permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* +Copyright (c) 2004, 2009, 2011-2012 Genome Research Ltd. + +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "io_lib_config.h" +#endif + +#include +#include +#include +#include +#include + +#include "cram/vlen.h" +#include "cram/os.h" + +#ifndef MAX +#define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + +#ifndef ABS +#define ABS(a) ((a)>0?(a):-(a)) +#endif + +/* #define DEBUG_printf(a,n) printf(a,n) */ +#define DEBUG_printf(a,n) + +/* + * vlen: 27/10/95 written by James Bonfield, jkb@mrc-lmb.cam.ac.uk + * + * Given sprintf style of arguments this routine returns the maximum + * size of buffer needed to allocate to use with sprintf. It errs on + * the side of caution by being simplistic in its approach: we assume + * all numbers are of maximum length. + * + * Handles the usual type conversions (%[%diuaxXcfeEgGpns]), but not + * the 'wide' character conversions (%C and %S). + * Precision is handled in the correct formats, including %*.* + * notations. + * Additionally, some of the more dubious (but probably illegal) cases + * are supported (eg "%10%" will expand to " %" on many + * systems). + * + * We also assume that the largest integer and larger pointer are 64 + * bits, which at least covers the machines we'll need it for. + */ +int flen(char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + return vflen(fmt, args); +} + +int vflen(char *fmt, va_list ap) +{ + int len = 0; + char *cp, c; + long long l; + int i; + double d; + + /* + * This code modifies 'ap', but we do not know if va_list is a structure + * or a pointer to an array so we do not know if it is a local variable + * or not. + * C99 gets around this by defining va_copy() to make copies of ap, but + * this does not exist on all systems. + * For now, I just assume that when va_list is a pointer the system also + * provides a va_copy macro to work around this problem. The only system + * I have seen needing this so far was Linux on AMD64. + */ +#if defined(HAVE_VA_COPY) + va_list ap_local; + va_copy(ap_local, ap); +# define ap ap_local +#endif + + for(cp = fmt; *cp; cp++) { + switch(*cp) { + + /* A format specifier */ + case '%': { + char *endp; + long conv_len1=0, conv_len2=0, conv_len=0; + signed int arg_size; + + /* Firstly, strip the modifier flags (+-#0 and [space]) */ + for(; (c=*++cp);) { + if ('#' == c) + len+=2; /* Worst case of "0x" */ + else if ('-' == c || '+' == c || ' ' == c) + len++; + else + break; + } + + /* Width specifier */ + l = strtol(cp, &endp, 10); + if (endp != cp) { + cp = endp; + conv_len = conv_len1 = l; + } else if (*cp == '*') { + conv_len = conv_len1 = (int)va_arg(ap, int); + cp++; + } + + /* Precision specifier */ + if ('.' 
== *cp) { + cp++; + conv_len2 = strtol(cp, &endp, 10); + if (endp != cp) { + cp = endp; + } else if (*cp == '*') { + conv_len2 = (int)va_arg(ap, int); + cp++; + } + conv_len = MAX(conv_len1, conv_len2); + } + + /* Short/long identifier */ + if ('h' == *cp) { + arg_size = -1; /* short */ + cp++; + } else if ('l' == *cp) { + arg_size = 1; /* long */ + cp++; + if ('l' == *cp) { + arg_size = 2; /* long long */ + cp++; + } + } else { + arg_size = 0; /* int */ + } + + /* The actual type */ + switch (*cp) { + case '%': + /* + * Not real ANSI I suspect, but we'll allow for the + * completely daft "%10%" example. + */ + len += MAX(conv_len1, 1); + break; + + case 'd': + case 'i': + case 'u': + case 'a': + case 'x': + case 'X': + /* Remember: char and short are sent as int on the stack */ + if (arg_size == -1) + l = (long)va_arg(ap, int); + else if (arg_size == 1) + l = va_arg(ap, long); + else if (arg_size == 2) + l = va_arg(ap, long long); + else + l = (long)va_arg(ap, int); + + DEBUG_printf("%d", l); + + /* + * No number can be more than 24 characters so we'll take + * the max of conv_len and 24 (23 is len(2^64) in octal). + * All that work above and we then go and estimate ;-), + * but it's needed incase someone does %500d. + */ + len += MAX(conv_len, 23); + break; + + case 'c': + i = va_arg(ap, int); + DEBUG_printf("%c", i); + /* + * Note that %10c and %.10c act differently. + * Besides, I think precision is not really allowed for %c. + */ + len += MAX(conv_len1, 1); + break; + + case 'f': + d = va_arg(ap, double); + DEBUG_printf("%f", d); + /* + * Maybe "Inf" or "NaN", but we'll not worry about that. + * Again, err on side of caution and take max of conv_len + * and max length of a double. The worst case I can + * think of is 317 characters (-1[308 zeros].000000) + * without using precision codes. That's horrid. I + * cheat and either use 317 or 15 depending on how + * large the number is as I reckon 99% of floats + * aren't that long. + */ + l = (ABS(d) > 1000000) ? 
317 : 15; + l = MAX(l, conv_len1 + 2); + if (conv_len2) l += conv_len2 - 6; + len += l; + break; + + case 'e': + case 'E': + case 'g': + case 'G': + d = va_arg(ap, double); + DEBUG_printf("%g", d); + /* + * Maybe "Inf" or "NaN", but we'll not worry about that + * Again, err on side of caution and take max of conv_len + * and max length of a double (which defaults to only + * '-' + 6 + '.' + 'E[+-]xxx' == 13. + */ + len += MAX(conv_len, 13); + break; + + case 'p': + l = (long)va_arg(ap, void *); + /* + * Max pointer is 64bits == 16 chars (on alpha), + * == 20 with + "0x". + */ + DEBUG_printf("%p", (void *)l); + len += MAX(conv_len, 20); + break; + + case 'n': + /* produces no output */ + break; + + case 's': { + char *s = (char *)va_arg(ap, char *); + DEBUG_printf("%s", s); + + if (!conv_len2) { + len += MAX(conv_len, (int)strlen(s)); + } else { + len += conv_len; + } + break; + } + + default: + /* wchar_t types of 'C' and 'S' aren't supported */ + DEBUG_printf("Arg is %c\n", *cp); + } + + } + + case '\0': + break; + + default: + DEBUG_printf("%c", *cp); + len++; + } + } + + va_end(ap); + + return len+1; /* one for the null character */ +} + +#if 0 +int main() { + int l; + char buf[10000]; + + sprintf(buf, "d: %d\n", 500); + l = flen("d: %d\n", 500); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, ""); + l = flen(""); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%s\n","test"); + l = flen("%s\n", "test"); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%c\n", 'a'); + l = flen("%c\n", 'a'); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%31.30f\n", -9999.99); + l = flen("%31.30f\n", -9999.99); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%f\n", -1e308); + l = flen("%f\n", -1e308); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%.9f\n", -1e308); + l = flen("%.9f\n", -1e308); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%10.20f\n", -1.999222333); + l = flen("%10.20f\n", -1.999222333); + printf("%d 
%d\n\n", strlen(buf), l); + + sprintf(buf, "%#g\n", -3.14159265358e-222); + l = flen("%#g\n", -3.1415927e-222); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%e\n", -123456789123456789.1); + l = flen("%e\n", -123456789123456789.1); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%c %f %d %s %c %g %ld %s\n", 'a', 3.1, 9, "one", 'b', 4.2, 9, "two"); + l = flen("%c %f %d %s %c %g %ld %s\n", 'a', 3.1, 9, "one", 'b', 4.2, 9, "two"); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%*.*e %*c\n", 10, 5, 9.0, 20, 'x'); + l = flen("%*.*e %*c\n", 10, 5, 9.0, 20, 'x'); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%10c\n", 'z'); + l = flen("%10c\n", 'z'); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%.10c\n", 'z'); + l = flen("%.10c\n", 'z'); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%10d\n", 'z'); + l = flen("%10d\n", 'z'); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%.10d\n", 'z'); + l = flen("%.10d\n", 'z'); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%10%\n"); + l = flen("%10%\n"); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%.10%\n"); + l = flen("%.10%\n"); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%s\n", "0123456789"); + l = flen("%s\n", "0123456789"); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%5s\n", "0123456789"); + l = flen("%5s\n", "0123456789"); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%50s\n", "0123456789"); + l = flen("%50s\n", "0123456789"); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%.5s\n", "0123456789"); + l = flen("%.5s\n", "0123456789"); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%.50s\n", "0123456789"); + l = flen("%.50s\n", "0123456789"); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%5.50s\n", "0123456789"); + l = flen("%5.50s\n", "0123456789"); + printf("%d %d\n\n", strlen(buf), l); + + sprintf(buf, "%50.5s\n", "0123456789"); + l = flen("%50.5s\n", "0123456789"); + 
printf("%d %d\n\n", strlen(buf), l); + + return 0; +} +#endif diff --git a/star-sys/STAR/source/htslib/cram/vlen.h b/star-sys/STAR/source/htslib/cram/vlen.h new file mode 100644 index 0000000..6b9b07c --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/vlen.h @@ -0,0 +1,48 @@ +/* +Author: James Bonfield (jkb@sanger.ac.uk) + +Copyright (c) 1995-1996 MEDICAL RESEARCH COUNCIL +All rights reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1 Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2 Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF +MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or +promote products derived from this software without specific prior written +permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef _VLEN_H_ +#define _VLEN_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern int vflen(char *fmt, va_list ap); +extern int flen(char *fmt, ...); + +#ifdef __cplusplus +} +#endif + +#endif /* _VLEN_H_ */ diff --git a/star-sys/STAR/source/htslib/cram/zfio.c b/star-sys/STAR/source/htslib/cram/zfio.c new file mode 100644 index 0000000..0a0ae0c --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/zfio.c @@ -0,0 +1,185 @@ +/* +Copyright (c) 2009-2013 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "io_lib_config.h" +#endif + +#include +#include + +#include "cram/os.h" +#include "cram/zfio.h" + +/* ------------------------------------------------------------------------ */ +/* Some wrappers around FILE * vs gzFile *, allowing for either */ + +/* + * gzopen() works on both compressed and uncompressed data, but it has + * a significant performance hit even for uncompressed data (tested as + * 25s using FILE* to 46s via gzOpen and 66s via gzOpen when gzipped). + * + * Hence we use our own wrapper 'zfp' which is a FILE* when uncompressed + * and gzFile* when compressed. This also means we could hide bzopen in + * there too if desired. + */ + +off_t zftello(zfp *zf) { + return zf->fp ? ftello(zf->fp) : -1; +} + +int zfseeko(zfp *zf, off_t offset, int whence) { + return zf->fp ? fseeko(zf->fp, offset, whence) : -1; +} + + +/* + * A wrapper for either fgets or gzgets depending on what has been + * opened. + */ +char *zfgets(char *line, int size, zfp *zf) { + if (zf->fp) + return fgets(line, size, zf->fp); + else + return gzgets(zf->gz, line, size); +} + +/* + * A wrapper for either fputs or gzputs depending on what has been + * opened. Returns a non-negative value on success and EOF on error. + */ +int zfputs(char *line, zfp *zf) { + if (zf->fp) + return fputs(line, zf->fp); + else + return gzputs(zf->gz, line) >= 0 ? 0 : EOF; /* gzputs reports failure as -1 and may legitimately write 0 bytes, so test the sign */ +} + +/* + * Peeks at and returns the next character without consuming it from the + * input. (i.e. a combination of getc and ungetc). + */ +int zfpeek(zfp *zf) { + int c; + + if (zf->fp) { + c = getc(zf->fp); + if (c != EOF) + ungetc(c, zf->fp); + } else { + c = gzgetc(zf->gz); + if (c != EOF) + gzungetc(c, zf->gz); + } + + return c; +} + +/* A replacement for either feof or gzeof */ +int zfeof(zfp *zf) { + return zf->fp ? 
feof(zf->fp) : gzeof(zf->gz); +} + +/* A replacement for either fopen or gzopen; returns a malloc'ed zfp (release it with zfclose) or NULL on failure */ +zfp *zfopen(const char *path, const char *mode) { + char path2[1024]; + zfp *zf; + + if (!(zf = (zfp *)malloc(sizeof(*zf)))) + return NULL; + zf->fp = NULL; + zf->gz = NULL; + + /* Try normal fopen */ + if (mode[0] != 'z' && mode[1] != 'z' && + NULL != (zf->fp = fopen(path, mode))) { + unsigned char magic[2]; + if (2 != fread(magic, 1, 2, zf->fp)) { + fclose(zf->fp); free(zf); /* close the stream as well, otherwise it leaks when the magic bytes cannot be read */ + return NULL; + } + if (!(magic[0] == 0x1f && + magic[1] == 0x8b)) { + fseeko(zf->fp, 0, SEEK_SET); + return zf; + } + + fclose(zf->fp); + zf->fp = NULL; + } + +#ifdef HAVE_POPEN + /* + * I've no idea why, but gzgets is VERY slow, maybe because it handles + * arbitrary seeks. + * popen to gzip -cd is 3 times faster though. + */ + if (*mode == 'w') { + } else { + if (access(path, R_OK) == 0) { + sprintf(path2, "gzip -cd < %.*s", 1000, path); + if (NULL != (zf->fp = popen(path2, "r"))) + return zf; + } + + sprintf(path2, "gzip -cd < %.*s.gz", 1000, path); + if (NULL != (zf->fp = popen(path2, "r"))) + return zf; + + printf("Failed on %s\n", path); + } else { + sprintf(path2, "gzip > %.*s", 1000, path); + if (NULL != (zf->fp = popen(path2, "w"))) + return zf; + } + + printf("Failed on %s\n", path); + } +#else + /* Gzopen instead */ + if ((zf->gz = gzopen(path, mode))) + return zf; + + sprintf(path2, "%.*s.gz", 1020, path); + if ((zf->gz = gzopen(path2, mode))) + return zf; +#endif + + perror(path); + + free(zf); + return NULL; +} + +int zfclose(zfp *zf) { + int r = (zf->fp) ? fclose(zf->fp) : gzclose(zf->gz); + free(zf); + return r; +} diff --git a/star-sys/STAR/source/htslib/cram/zfio.h b/star-sys/STAR/source/htslib/cram/zfio.h new file mode 100644 index 0000000..2d0580f --- /dev/null +++ b/star-sys/STAR/source/htslib/cram/zfio.h @@ -0,0 +1,54 @@ +/* +Copyright (c) 2009-2013 Genome Research Ltd. 
+Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _ZFIO_H_ +#define _ZFIO_H_ + +#include +#include + +/* + * Either a gzFile or a FILE. 
+ */ +typedef struct { + FILE *fp; + gzFile gz; +} zfp; + +off_t zftello(zfp *zf); +int zfseeko(zfp *zf, off_t offset, int whence); +char *zfgets(char *line, int size, zfp *zf); +int zfputs(char *line, zfp *zf); +zfp *zfopen(const char *path, const char *mode); +int zfclose(zfp *zf); +int zfpeek(zfp *zf); +int zfeof(zfp *zf); + +#endif /* _ZFIO_H_ */ diff --git a/star-sys/STAR/source/htslib/faidx.5 b/star-sys/STAR/source/htslib/faidx.5 new file mode 100644 index 0000000..9058e51 --- /dev/null +++ b/star-sys/STAR/source/htslib/faidx.5 @@ -0,0 +1,124 @@ +'\" t +.TH faidx 5 "August 2013" "htslib" "Bioinformatics formats" +.SH NAME +faidx \- an index enabling random access to FASTA files +.SH SYNOPSIS +.IR file.fa .fai, +.IR file.fasta .fai +.SH DESCRIPTION +Using an \fBfai index\fP file in conjunction with a FASTA file containing +reference sequences enables efficient access to arbitrary regions within +those reference sequences. +The index file typically has the same filename as the corresponding FASTA +file, with \fB.fai\fP appended. +.P +An \fBfai index\fP file is a text file consisting of lines each with +five TAB-delimited columns: +.TS +lbl. +NAME Name of this reference sequence +LENGTH Total length of this reference sequence, in bases +OFFSET Offset within the FASTA file of this sequence's first base +LINEBASES The number of bases on each line +LINEWIDTH The number of bytes in each line, including the newline +.TE +.P +The \fBNAME\fP and \fBLENGTH\fP columns contain the same +data as would appear in the \fBSN\fP and \fBLN\fP fields of a +SAM \fB@SQ\fP header for the same reference sequence. +.P +The \fBOFFSET\fP column contains the offset within the FASTA file, in bytes +starting from zero, of the first base of this reference sequence, i.e., of +the character following the newline at the end of the "\fB>\fP" header line. 
+Typically the lines of a \fBfai index\fP file appear in the order in which the +reference sequences appear in the FASTA file, so \fB.fai\fP files are typically +sorted according to this column. +.P +The \fBLINEBASES\fP column contains the number of bases in each of the sequence +lines that form the body of this reference sequence, apart from the final line +which may be shorter. +The \fBLINEWIDTH\fP column contains the number of \fIbytes\fP in each of +the sequence lines (except perhaps the final line), thus differing from +\fBLINEBASES\fP in that it also counts the bytes forming the line terminator. +.SS FASTA Files +In order to be indexed with \fBsamtools faidx\fP, a FASTA file must be a text +file of the form +.LP +.RS +.RI > name +.RI [ description ...] +.br +ATGCATGCATGCATGCATGCATGCATGCAT +.br +GCATGCATGCATGCATGCATGCATGCATGC +.br +ATGCAT +.br +.RI > name +.RI [ description ...] +.br +ATGCATGCATGCAT +.br +GCATGCATGCATGC +.br +[...] +.RE +.LP +In particular, each reference sequence must be "well-formatted", i.e., all +of its sequence lines must be the same length, apart from the final sequence +line which may be shorter. +(While this sequence line length must be the same within each sequence, +it may vary between different reference sequences in the same FASTA file.) +.P +This also means that although the FASTA file may have Unix- or Windows-style +or other line termination, the newline characters present must be consistent, +at least within each reference sequence. +.P +The \fBsamtools\fP implementation uses the first word of the "\fB>\fP" header +line text (i.e., up to the first whitespace character) as the \fBNAME\fP column. +At present, there may be no whitespace between the +">" character and the \fIname\fP. 
+.SH EXAMPLE +For example, given this FASTA file +.LP +.RS +>one +.br +ATGCATGCATGCATGCATGCATGCATGCAT +.br +GCATGCATGCATGCATGCATGCATGCATGC +.br +ATGCAT +.br +>two another chromosome +.br +ATGCATGCATGCAT +.br +GCATGCATGCATGC +.br +.RE +.LP +formatted with Unix-style (LF) line termination, the corresponding fai index +would be +.RS +.TS +lnnnn. +one 66 5 30 31 +two 28 98 14 15 +.TE +.RE +.LP +If the FASTA file were formatted with Windows-style (CR-LF) line termination, +the fai index would be +.RS +.TS +lnnnn. +one 66 6 30 32 +two 28 103 14 16 +.TE +.RE +.SH SEE ALSO +.IR samtools (1) +.TP +http://en.wikipedia.org/wiki/FASTA_format +Further description of the FASTA format diff --git a/star-sys/STAR/source/htslib/faidx.c b/star-sys/STAR/source/htslib/faidx.c new file mode 100644 index 0000000..05dff83 --- /dev/null +++ b/star-sys/STAR/source/htslib/faidx.c @@ -0,0 +1,421 @@ +#include "config.h" + +#include +#include +#include +#include +#include + +#include "htslib/bgzf.h" +#include "htslib/faidx.h" +#include "htslib/khash.h" +#ifdef _USE_KNETFILE +#include "htslib/knetfile.h" +#endif + +typedef struct { + int32_t line_len, line_blen; + int64_t len; + uint64_t offset; +} faidx1_t; +KHASH_MAP_INIT_STR(s, faidx1_t) + +struct __faidx_t { + BGZF *bgzf; + int n, m; + char **name; + khash_t(s) *hash; +}; + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) +{ + khint_t k; + int ret; + faidx1_t t; + if (idx->n == idx->m) { + idx->m = idx->m? 
idx->m<<1 : 16; + idx->name = (char**)realloc(idx->name, sizeof(char*) * idx->m); + } + idx->name[idx->n] = strdup(name); + k = kh_put(s, idx->hash, idx->name[idx->n], &ret); + t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; + kh_value(idx->hash, k) = t; + ++idx->n; +} + +faidx_t *fai_build_core(BGZF *bgzf) +{ + int c; char *name; /* c must be int: bgzf_getc() signals end of input with a negative value, which a plain (possibly unsigned) char cannot hold */ + int l_name, m_name; + int line_len, line_blen, state; + int l1, l2; + faidx_t *idx; + uint64_t offset; + int64_t len; + + idx = (faidx_t*)calloc(1, sizeof(faidx_t)); + idx->hash = kh_init(s); + name = 0; l_name = m_name = 0; + len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; + while ( (c=bgzf_getc(bgzf))>=0 ) { + if (c == '\n') { // an empty line + if (state == 1) { + offset = bgzf_utell(bgzf); + continue; + } else if ((state == 0 && len < 0) || state == 2) continue; + } + if (c == '>') { // fasta header + if (len >= 0) + fai_insert_index(idx, name, len, line_len, line_blen, offset); + l_name = 0; + while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) { + if (m_name < l_name + 2) { + m_name = l_name + 2; + kroundup32(m_name); + name = (char*)realloc(name, m_name); + } + name[l_name++] = c; + } + name[l_name] = '\0'; + if ( c<0 ) { + fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); + free(name); fai_destroy(idx); + return 0; + } + if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n'); + state = 1; len = 0; + offset = bgzf_utell(bgzf); + } else { + if (state == 3) { + fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + if (state == 2) state = 3; + l1 = l2 = 0; + do { + ++l1; + if (isgraph(c)) ++l2; + } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n'); + if (state == 3 && l2) { + fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + ++l1; len += l2; + if (state == 1) line_len = l1, line_blen = l2, 
state = 0; + else if (state == 0) { + if (l1 != line_len || l2 != line_blen) state = 2; + } + } + } + fai_insert_index(idx, name, len, line_len, line_blen, offset); + free(name); + return idx; +} + +void fai_save(const faidx_t *fai, FILE *fp) +{ + khint_t k; + int i; + for (i = 0; i < fai->n; ++i) { + faidx1_t x; + k = kh_get(s, fai->hash, fai->name[i]); + x = kh_value(fai->hash, k); +#ifdef _WIN32 + fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len); +#else + fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); +#endif + } +} + +faidx_t *fai_read(FILE *fp) +{ + faidx_t *fai; + char *buf, *p; + int len, line_len, line_blen; +#ifdef _WIN32 + long offset; +#else + long long offset; +#endif + fai = (faidx_t*)calloc(1, sizeof(faidx_t)); + fai->hash = kh_init(s); + buf = (char*)calloc(0x10000, 1); + while (!feof(fp) && fgets(buf, 0x10000, fp)) { + for (p = buf; *p && isgraph(*p); ++p); + *p = 0; ++p; +#ifdef _WIN32 + sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len); +#else + sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); +#endif + fai_insert_index(fai, buf, len, line_len, line_blen, offset); + } + free(buf); + return fai; +} + +void fai_destroy(faidx_t *fai) +{ + int i; + for (i = 0; i < fai->n; ++i) free(fai->name[i]); + free(fai->name); + kh_destroy(s, fai->hash); + if (fai->bgzf) bgzf_close(fai->bgzf); + free(fai); +} + +int fai_build(const char *fn) +{ + char *str; + BGZF *bgzf; + FILE *fp; + faidx_t *fai; + str = (char*)calloc(strlen(fn) + 5, 1); + sprintf(str, "%s.fai", fn); + bgzf = bgzf_open(fn, "r"); + if ( !bgzf ) { + fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn); + free(str); + return -1; + } + if ( bgzf->is_compressed ) bgzf_index_build_init(bgzf); + fai = fai_build_core(bgzf); if ( !fai ) { bgzf_close(bgzf); free(str); return -1; } /* fai_build_core returns 0 (after printing why) on malformed input; stop before fai_save dereferences it */ + if ( bgzf->is_compressed ) bgzf_index_dump(bgzf, fn, ".gzi"); + bgzf_close(bgzf); + fp = fopen(str, 
"wb"); + if ( !fp ) { + fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str); + fai_destroy(fai); free(str); + return -1; + } + fai_save(fai, fp); + fclose(fp); + free(str); + fai_destroy(fai); + return 0; +} + +#ifdef _USE_KNETFILE +FILE *download_and_open(const char *fn) +{ + const int buf_size = 1 * 1024 * 1024; + uint8_t *buf; + FILE *fp; + knetFile *fp_remote; + const char *url = fn; + const char *p; + int l = strlen(fn); + for (p = fn + l - 1; p >= fn; --p) + if (*p == '/') break; + fn = p + 1; + + // First try to open a local copy + fp = fopen(fn, "r"); + if (fp) + return fp; + + // If failed, download from remote and open + fp_remote = knet_open(url, "rb"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url); + return NULL; + } + if ((fp = fopen(fn, "wb")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn); + knet_close(fp_remote); + return NULL; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) > 0) /* also stop if knet_read ever returns a negative value; it must not reach fwrite as a length */ + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); + + return fopen(fn, "r"); +} +#endif + +faidx_t *fai_load(const char *fn) +{ + char *str; + FILE *fp; + faidx_t *fai; + str = (char*)calloc(strlen(fn) + 5, 1); + sprintf(str, "%s.fai", fn); + +#ifdef _USE_KNETFILE + if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) + { + fp = download_and_open(str); + if ( !fp ) + { + fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str); + free(str); + return 0; + } + } + else +#endif + fp = fopen(str, "rb"); + if (fp == 0) { + fprintf(stderr, "[fai_load] build FASTA index.\n"); + fai_build(fn); + fp = fopen(str, "rb"); + if (fp == 0) { + fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); + free(str); + return 0; + } + } + + fai = fai_read(fp); + fclose(fp); + + fai->bgzf = bgzf_open(fn, "rb"); + free(str); + if (fai->bgzf == 0) { + fprintf(stderr, 
"[fai_load] fail to open FASTA file.\n"); + fai_destroy(fai); return 0; /* release the index built by fai_read; fai_destroy skips bgzf_close when bgzf is NULL */ + } + if ( fai->bgzf->is_compressed==1 ) + { + if ( bgzf_index_load(fai->bgzf, fn, ".gzi") < 0 ) + { + fprintf(stderr, "[fai_load] failed to load .gzi index: %s[.gzi]\n", fn); + fai_destroy(fai); + return NULL; + } + } + return fai; +} + +char *fai_fetch(const faidx_t *fai, const char *str, int *len) +{ + char *s; int c; /* c must be int: bgzf_getc() signals end of input with a negative value, which a plain (possibly unsigned) char cannot hold */ + int i, l, k, name_end; + khiter_t iter; + faidx1_t val; + khash_t(s) *h; + int beg, end; + + beg = end = -1; + h = fai->hash; + name_end = l = strlen(str); + s = (char*)malloc(l+1); + // remove space + for (i = k = 0; i < l; ++i) + if (!isspace(str[i])) s[k++] = str[i]; + s[k] = 0; l = k; + // determine the sequence name + for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end + if (i >= 0) name_end = i; + if (name_end < l) { // check if this is really the end + int n_hyphen = 0; + for (i = name_end + 1; i < l; ++i) { + if (s[i] == '-') ++n_hyphen; + else if (!isdigit(s[i]) && s[i] != ',') break; + } + if (i < l || n_hyphen > 1) name_end = l; // malformatted region string; then take str as the name + s[name_end] = 0; + iter = kh_get(s, h, s); + if (iter == kh_end(h)) { // cannot find the sequence name + iter = kh_get(s, h, str); // try str as the name + if (iter == kh_end(h)) { + *len = 0; + free(s); return 0; + } else s[name_end] = ':', name_end = l; + } + } else iter = kh_get(s, h, str); + if(iter == kh_end(h)) { + fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str); + free(s); + *len = -2; + return 0; + }; + val = kh_value(h, iter); + // parse the interval + if (name_end < l) { + for (i = k = name_end + 1; i < l; ++i) + if (s[i] != ',') s[k++] = s[i]; + s[k] = 0; + beg = atoi(s + name_end + 1); + for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; + end = i < k? 
atoi(s + i + 1) : val.len; + if (beg > 0) --beg; + } else beg = 0, end = val.len; + if (beg >= val.len) beg = val.len; + if (end >= val.len) end = val.len; + if (beg > end) beg = end; + free(s); + + // now retrieve the sequence + int ret = bgzf_useek(fai->bgzf, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); + if ( ret<0 ) + { + *len = -1; + fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n"); + return NULL; + } + l = 0; + s = (char*)malloc(end - beg + 2); + while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg ) + if (isgraph(c)) s[l++] = c; + s[l] = '\0'; + *len = l; + return s; +} + +int faidx_fetch_nseq(const faidx_t *fai) +{ + return fai->n; +} + +char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +{ + int l; + int c; /* int, not char: bgzf_getc() signals end of input with a negative value, which a plain (possibly unsigned) char cannot hold */ + khiter_t iter; + faidx1_t val; + char *seq=NULL; + + // Adjust position + iter = kh_get(s, fai->hash, c_name); + if (iter == kh_end(fai->hash)) + { + *len = -2; + fprintf(stderr, "[fai_fetch_seq] The sequence \"%s\" not found\n", c_name); + return NULL; + } + val = kh_value(fai->hash, iter); + if(p_end_i < p_beg_i) p_beg_i = p_end_i; + if(p_beg_i < 0) p_beg_i = 0; + else if(val.len <= p_beg_i) p_beg_i = val.len - 1; + if(p_end_i < 0) p_end_i = 0; + else if(val.len <= p_end_i) p_end_i = val.len - 1; + + // Now retrieve the sequence + int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); + if ( ret<0 ) + { + *len = -1; + fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. 
(Seeking in a compressed, .gzi unindexed, file?)\n"); + return NULL; + } + l = 0; + seq = (char*)malloc(p_end_i - p_beg_i + 2); + while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1) + if (isgraph(c)) seq[l++] = c; + seq[l] = '\0'; + *len = l; + return seq; +} + + diff --git a/star-sys/STAR/source/htslib/hfile.c b/star-sys/STAR/source/htslib/hfile.c new file mode 100644 index 0000000..0188410 --- /dev/null +++ b/star-sys/STAR/source/htslib/hfile.c @@ -0,0 +1,526 @@ +/* hfile.c -- buffered low-level input/output streams. + + Copyright (C) 2013-2014 Genome Research Ltd. + + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include +#include +#include +#include + +#include "htslib/hfile.h" +#include "hfile_internal.h" + +/* hFILE fields are used as follows: + + char *buffer; // Pointer to the start of the I/O buffer + char *begin; // First not-yet-read character / unused position + char *end; // First unfilled/unfillable position + char *limit; // Pointer to the first position past the buffer + + const hFILE_backend *backend; // Methods to refill/flush I/O buffer + + off_t offset; // Offset within the stream of buffer position 0 + int at_eof:1; // For reading, whether EOF has been seen + int has_errno; // Error number from the last failure on this stream + +For reading, begin is the first unread character in the buffer and end is the +first unfilled position: + + -----------ABCDEFGHIJKLMNO--------------- + ^buffer ^begin ^end ^limit + +For writing, begin is the first unused position and end is unused so remains +equal to buffer: + + ABCDEFGHIJKLMNOPQRSTUVWXYZ--------------- + ^buffer ^begin ^limit + ^end + +Thus if begin > end then there is a non-empty write buffer, if begin < end +then there is a non-empty read buffer, and if begin == end then both buffers +are empty. In all cases, the stream's file position indicator corresponds +to the position pointed to by begin. 
*/ + +hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity) +{ + hFILE *fp = (hFILE *) malloc(struct_size); + if (fp == NULL) goto error; + + if (capacity == 0) capacity = 32768; + // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory + if (strchr(mode, 'r') && capacity > 32768) capacity = 32768; + + fp->buffer = (char *) malloc(capacity); + if (fp->buffer == NULL) goto error; + + fp->begin = fp->end = fp->buffer; + fp->limit = &fp->buffer[capacity]; + + fp->offset = 0; + fp->at_eof = 0; + fp->has_errno = 0; + return fp; + +error: + hfile_destroy(fp); + return NULL; +} + +void hfile_destroy(hFILE *fp) +{ + int save = errno; + if (fp) free(fp->buffer); + free(fp); + errno = save; +} + +static inline int writebuffer_is_nonempty(hFILE *fp) +{ + return fp->begin > fp->end; +} + +/* Refills the read buffer from the backend (once, so may only partially + fill the buffer), returning the number of additional characters read + (which might be 0), or negative when an error occurred. */ +static ssize_t refill_buffer(hFILE *fp) +{ + ssize_t n; + + // Move any unread characters to the start of the buffer + if (fp->begin > fp->buffer) { + fp->offset += fp->begin - fp->buffer; + memmove(fp->buffer, fp->begin, fp->end - fp->begin); + fp->end = &fp->buffer[fp->end - fp->begin]; + fp->begin = fp->buffer; + } + + // Read into the available buffer space at fp->[end,limit) + if (fp->at_eof || fp->end == fp->limit) n = 0; + else { + n = fp->backend->read(fp, fp->end, fp->limit - fp->end); + if (n < 0) { fp->has_errno = errno; return n; } + else if (n == 0) fp->at_eof = 1; + } + + fp->end += n; + return n; +} + +/* Called only from hgetc(), when our buffer is empty. */ +int hgetc2(hFILE *fp) +{ + return (refill_buffer(fp) > 0)? 
(unsigned char) *(fp->begin++) : EOF; +} + +ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) +{ + size_t n = fp->end - fp->begin; + while (n < nbytes) { + ssize_t ret = refill_buffer(fp); + if (ret < 0) return ret; + else if (ret == 0) break; + else n += ret; + } + + if (n > nbytes) n = nbytes; + memcpy(buffer, fp->begin, n); + return n; +} + +/* Called only from hread(); when called, our buffer is empty and nread bytes + have already been placed in the destination buffer. */ +ssize_t hread2(hFILE *fp, void *destv, size_t nbytes, size_t nread) +{ + const size_t capacity = fp->limit - fp->buffer; + char *dest = (char *) destv; + dest += nread, nbytes -= nread; + + // Read large requests directly into the destination buffer + while (nbytes * 2 >= capacity && !fp->at_eof) { + ssize_t n = fp->backend->read(fp, dest, nbytes); + if (n < 0) { fp->has_errno = errno; return n; } + else if (n == 0) fp->at_eof = 1; + fp->offset += n; + dest += n, nbytes -= n; + nread += n; + } + + while (nbytes > 0 && !fp->at_eof) { + size_t n; + ssize_t ret = refill_buffer(fp); + if (ret < 0) return ret; + + n = fp->end - fp->begin; + if (n > nbytes) n = nbytes; + memcpy(dest, fp->begin, n); + fp->begin += n; + dest += n, nbytes -= n; + nread += n; + } + + return nread; +} + +/* Flushes the write buffer, fp->[buffer,begin), out through the backend + returning 0 on success or negative if an error occurred. */ +static ssize_t flush_buffer(hFILE *fp) +{ + const char *buffer = fp->buffer; + while (buffer < fp->begin) { + ssize_t n = fp->backend->write(fp, buffer, fp->begin - buffer); + if (n < 0) { fp->has_errno = errno; return n; } + buffer += n; + fp->offset += n; + } + + fp->begin = fp->buffer; // Leave the buffer empty + return 0; +} + +int hflush(hFILE *fp) +{ + if (flush_buffer(fp) < 0) return EOF; + if (fp->backend->flush(fp) < 0) { fp->has_errno = errno; return EOF; } + return 0; +} + +/* Called only from hputc(), when our buffer is already full. 
*/ +int hputc2(int c, hFILE *fp) +{ + if (flush_buffer(fp) < 0) return EOF; + *(fp->begin++) = c; + return c; +} + +/* Called only from hwrite() and hputs2(); when called, our buffer is full and + ncopied bytes from the source have already been copied to our buffer. */ +ssize_t hwrite2(hFILE *fp, const void *srcv, size_t totalbytes, size_t ncopied) +{ + const char *src = (const char *) srcv; + ssize_t ret; + const size_t capacity = fp->limit - fp->buffer; + size_t remaining = totalbytes - ncopied; + src += ncopied; + + ret = flush_buffer(fp); + if (ret < 0) return ret; + + // Write large blocks out directly from the source buffer + while (remaining * 2 >= capacity) { + ssize_t n = fp->backend->write(fp, src, remaining); + if (n < 0) { fp->has_errno = errno; return n; } + fp->offset += n; + src += n, remaining -= n; + } + + // Just buffer any remaining characters + memcpy(fp->begin, src, remaining); + fp->begin += remaining; + + return totalbytes; +} + +/* Called only from hputs(), when our buffer is already full. */ +int hputs2(const char *text, size_t totalbytes, size_t ncopied, hFILE *fp) +{ + return (hwrite2(fp, text, totalbytes, ncopied) >= 0)? 
0 : EOF; +} + +off_t hseek(hFILE *fp, off_t offset, int whence) +{ + off_t pos; + + if (writebuffer_is_nonempty(fp)) { + int ret = flush_buffer(fp); + if (ret < 0) return ret; + } + + pos = fp->backend->seek(fp, offset, whence); + if (pos < 0) { fp->has_errno = errno; return pos; } + + // Seeking succeeded, so discard any non-empty read buffer + fp->begin = fp->end = fp->buffer; + fp->at_eof = 0; + + fp->offset = pos; + return pos; +} + +int hclose(hFILE *fp) +{ + int err = fp->has_errno; + + if (writebuffer_is_nonempty(fp) && hflush(fp) < 0) err = fp->has_errno; + if (fp->backend->close(fp) < 0) err = errno; + hfile_destroy(fp); + + if (err) { + errno = err; + return EOF; + } + else return 0; +} + +void hclose_abruptly(hFILE *fp) +{ + int save = errno; + if (fp->backend->close(fp) < 0) { /* Ignore subsequent errors */ } + hfile_destroy(fp); + errno = save; +} + + +/*************************** + * File descriptor backend * + ***************************/ + +#include +#include +#include +#include + +#ifdef _WIN32 +#define HAVE_CLOSESOCKET +#endif + +/* For Unix, it doesn't matter whether a file descriptor is a socket. + However Windows insists on send()/recv() and its own closesocket() + being used when fd happens to be a socket. */ + +typedef struct { + hFILE base; + int fd; + int is_socket:1; +} hFILE_fd; + +static ssize_t fd_read(hFILE *fpv, void *buffer, size_t nbytes) +{ + hFILE_fd *fp = (hFILE_fd *) fpv; + ssize_t n; + do { + n = fp->is_socket? recv(fp->fd, buffer, nbytes, 0) + : read(fp->fd, buffer, nbytes); + } while (n < 0 && errno == EINTR); + return n; +} + +static ssize_t fd_write(hFILE *fpv, const void *buffer, size_t nbytes) +{ + hFILE_fd *fp = (hFILE_fd *) fpv; + ssize_t n; + do { + n = fp->is_socket? 
send(fp->fd, buffer, nbytes, 0) + : write(fp->fd, buffer, nbytes); + } while (n < 0 && errno == EINTR); + return n; +} + +static off_t fd_seek(hFILE *fpv, off_t offset, int whence) +{ + hFILE_fd *fp = (hFILE_fd *) fpv; + return lseek(fp->fd, offset, whence); +} + +static int fd_flush(hFILE *fpv) +{ + hFILE_fd *fp = (hFILE_fd *) fpv; + int ret; + do { +#ifdef HAVE_FDATASYNC + ret = fdatasync(fp->fd); +#else + ret = fsync(fp->fd); +#endif + // Ignore invalid-for-fsync(2) errors due to being, e.g., a pipe, + // and operation-not-supported errors (Mac OS X) + if (ret < 0 && (errno == EINVAL || errno == ENOTSUP)) ret = 0; + } while (ret < 0 && errno == EINTR); + return ret; +} + +static int fd_close(hFILE *fpv) +{ + hFILE_fd *fp = (hFILE_fd *) fpv; + int ret; + do { +#ifdef HAVE_CLOSESOCKET + ret = fp->is_socket? closesocket(fp->fd) : close(fp->fd); +#else + ret = close(fp->fd); +#endif + } while (ret < 0 && errno == EINTR); + return ret; +} + +static const struct hFILE_backend fd_backend = +{ + fd_read, fd_write, fd_seek, fd_flush, fd_close +}; + +static size_t blksize(int fd) +{ + struct stat sbuf; + if (fstat(fd, &sbuf) != 0) return 0; + return sbuf.st_blksize; +} + +static hFILE *hopen_fd(const char *filename, const char *mode) +{ + hFILE_fd *fp = NULL; + int fd = open(filename, hfile_oflags(mode), 0666); + if (fd < 0) goto error; + + fp = (hFILE_fd *) hfile_init(sizeof (hFILE_fd), mode, blksize(fd)); + if (fp == NULL) goto error; + + fp->fd = fd; + fp->is_socket = 0; + fp->base.backend = &fd_backend; + return &fp->base; + +error: + if (fd >= 0) { int save = errno; (void) close(fd); errno = save; } + hfile_destroy((hFILE *) fp); + return NULL; +} + +hFILE *hdopen(int fd, const char *mode) +{ + hFILE_fd *fp = (hFILE_fd*) hfile_init(sizeof (hFILE_fd), mode, blksize(fd)); + if (fp == NULL) return NULL; + + fp->fd = fd; + fp->is_socket = (strchr(mode, 's') != NULL); + fp->base.backend = &fd_backend; + return &fp->base; +} + +static hFILE *hopen_fd_stdinout(const char 
*mode) +{ + int fd = (strchr(mode, 'r') != NULL)? STDIN_FILENO : STDOUT_FILENO; + // TODO Set binary mode (for Windows) + return hdopen(fd, mode); +} + +int hfile_oflags(const char *mode) +{ + int rdwr = 0, flags = 0; + const char *s; + for (s = mode; *s; s++) + switch (*s) { + case 'r': rdwr = O_RDONLY; break; + case 'w': rdwr = O_WRONLY; flags |= O_CREAT | O_TRUNC; break; + case 'a': rdwr = O_WRONLY; flags |= O_CREAT | O_APPEND; break; + case '+': rdwr = O_RDWR; break; + default: break; + } + +#ifdef O_BINARY + flags |= O_BINARY; +#endif + + return rdwr | flags; +} + + +/********************* + * In-memory backend * + *********************/ + +typedef struct { + hFILE base; + const char *buffer; + size_t length, pos; +} hFILE_mem; + +static ssize_t mem_read(hFILE *fpv, void *buffer, size_t nbytes) +{ + hFILE_mem *fp = (hFILE_mem *) fpv; + size_t avail = fp->length - fp->pos; + if (nbytes > avail) nbytes = avail; + memcpy(buffer, fp->buffer + fp->pos, nbytes); + fp->pos += nbytes; + return nbytes; +} + +static off_t mem_seek(hFILE *fpv, off_t offset, int whence) +{ + hFILE_mem *fp = (hFILE_mem *) fpv; + size_t absoffset = (offset >= 0)? 
offset : -offset; + size_t origin; + + switch (whence) { + case SEEK_SET: origin = 0; break; + case SEEK_CUR: origin = fp->pos; break; + case SEEK_END: origin = fp->length; break; + default: errno = EINVAL; return -1; + } + + if ((offset < 0 && absoffset > origin) || + (offset >= 0 && absoffset > fp->length - origin)) { + errno = EINVAL; + return -1; + } + + fp->pos = origin + offset; + return fp->pos; +} + +static int mem_close(hFILE *fpv) +{ + return 0; +} + +static const struct hFILE_backend mem_backend = +{ + mem_read, NULL, mem_seek, NULL, mem_close +}; + +static hFILE *hopen_mem(const char *data, const char *mode) +{ + // TODO Implement write modes, which will require memory allocation + if (strchr(mode, 'r') == NULL) { errno = EINVAL; return NULL; } + + hFILE_mem *fp = (hFILE_mem *) hfile_init(sizeof (hFILE_mem), mode, 0); + if (fp == NULL) return NULL; + + fp->buffer = data; + fp->length = strlen(data); + fp->pos = 0; + fp->base.backend = &mem_backend; + return &fp->base; +} + + +/****************************** + * hopen() backend dispatcher * + ******************************/ + +hFILE *hopen(const char *fname, const char *mode) +{ + if (strncmp(fname, "http://", 7) == 0 || + strncmp(fname, "ftp://", 6) == 0) return hopen_net(fname, mode); + else if (strncmp(fname, "data:", 5) == 0) return hopen_mem(fname + 5, mode); + else if (strcmp(fname, "-") == 0) return hopen_fd_stdinout(mode); + else return hopen_fd(fname, mode); +} diff --git a/star-sys/STAR/source/htslib/hfile_internal.h b/star-sys/STAR/source/htslib/hfile_internal.h new file mode 100644 index 0000000..88b0c8e --- /dev/null +++ b/star-sys/STAR/source/htslib/hfile_internal.h @@ -0,0 +1,75 @@ +/* hfile_internal.h -- internal parts of low-level input/output streams. + + Copyright (C) 2013-2014 Genome Research Ltd. 
+ + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef HFILE_INTERNAL_H +#define HFILE_INTERNAL_H + +#include "htslib/hfile.h" + +struct hFILE_backend { + /* As per read(2), returning the number of bytes read (possibly 0) or + negative (and setting errno) on errors. Front-end code will call this + repeatedly if necessary to attempt to get the desired byte count. */ + ssize_t (*read)(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED; + + /* As per write(2), returning the number of bytes written or negative (and + setting errno) on errors. Front-end code will call this repeatedly if + necessary until the desired block is written or an error occurs. */ + ssize_t (*write)(hFILE *fp, const void *buffer, size_t nbytes) + HTS_RESULT_USED; + + /* As per lseek(2), returning the resulting offset within the stream or + negative (and setting errno) on errors. 
*/ + off_t (*seek)(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED; + + /* Performs low-level flushing, if any, e.g., fsync(2); for writing streams + only. Returns 0 for success or negative (and sets errno) on errors. */ + int (*flush)(hFILE *fp) HTS_RESULT_USED; + + /* Closes the underlying stream (for output streams, the buffer will + already have been flushed), returning 0 for success or negative (and + setting errno) on errors, as per close(2). */ + int (*close)(hFILE *fp) HTS_RESULT_USED; +}; + +/* These are called from the hopen() dispatcher, and should call hfile_init() + to malloc a struct "derived" from hFILE and initialise it appropriately, + including setting base.backend to their own backend vector. */ +hFILE *hopen_net(const char *filename, const char *mode); + +/* May be called by hopen_*() functions to decode a fopen()-style mode into + open(2)-style flags. */ +int hfile_oflags(const char *mode); + +/* Must be called by hopen_*() functions to allocate the hFILE struct and set + up its base. Capacity is a suggested buffer size (e.g., via fstat(2)) + or 0 for a default-sized buffer. */ +hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity); + +/* May be called by hopen_*() functions to undo the effects of hfile_init() + in the event opening the stream subsequently fails. (This is safe to use + even if fp is NULL. This takes care to preserve errno.) */ +void hfile_destroy(hFILE *fp); + +#endif diff --git a/star-sys/STAR/source/htslib/hfile_net.c b/star-sys/STAR/source/htslib/hfile_net.c new file mode 100644 index 0000000..53eda94 --- /dev/null +++ b/star-sys/STAR/source/htslib/hfile_net.c @@ -0,0 +1,99 @@ +/* hfile_net.c -- network backend for low-level input/output streams. + + Copyright (C) 2013-2014 Genome Research Ltd. 
+ + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include + +#include "hfile_internal.h" + +#include "htslib/knetfile.h" + +typedef struct { + hFILE base; + knetFile *netfp; +} hFILE_net; + +static int net_inited = 0; + +#ifdef _WIN32 +static void net_exit(void) +{ + knet_win32_destroy(); +} +#endif + +static int net_init(void) +{ +#ifdef _WIN32 + if (knet_win32_init() != 0) return -1; + + // In the unlikely event atexit() fails, it's better to succeed here and + // carry on and do the I/O; then eventually when the program exits, we'll + // merely have failed to clean up properly, as if we had aborted. 
+ (void) atexit(net_exit); +#endif + + net_inited = 1; + return 0; +} + +static ssize_t net_read(hFILE *fpv, void *buffer, size_t nbytes) +{ + hFILE_net *fp = (hFILE_net *) fpv; + return knet_read(fp->netfp, buffer, nbytes); +} + +static off_t net_seek(hFILE *fpv, off_t offset, int whence) +{ + hFILE_net *fp = (hFILE_net *) fpv; + return knet_seek(fp->netfp, offset, whence); +} + +static int net_close(hFILE *fpv) +{ + hFILE_net *fp = (hFILE_net *) fpv; + return knet_close(fp->netfp); +} + +static const struct hFILE_backend net_backend = +{ + net_read, NULL, net_seek, NULL, net_close +}; + +hFILE *hopen_net(const char *filename, const char *mode) +{ + hFILE_net *fp; + + // Do any networking initialisation if this is the first use. + if (! net_inited) { if (net_init() < 0) return NULL; } + + fp = (hFILE_net *) hfile_init(sizeof (hFILE_net), mode, 0); + if (fp == NULL) return NULL; + + fp->netfp = knet_open(filename, mode); + if (fp->netfp == NULL) { hfile_destroy((hFILE *) fp); return NULL; } + + fp->base.backend = &net_backend; + return &fp->base; +} diff --git a/star-sys/STAR/source/htslib/hts.c b/star-sys/STAR/source/htslib/hts.c new file mode 100644 index 0000000..87f88a9 --- /dev/null +++ b/star-sys/STAR/source/htslib/hts.c @@ -0,0 +1,1326 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "htslib/bgzf.h" +#include "htslib/hts.h" +#include "cram/cram.h" +#include "htslib/hfile.h" +#include "version.h" + +#include "htslib/kseq.h" +#define KS_BGZF 1 +#if KS_BGZF + // bgzf now supports gzip-compressed files + KSTREAM_INIT2(, BGZF*, bgzf_read, 65536) +#else + KSTREAM_INIT2(, gzFile, gzread, 16384) +#endif + +#include "htslib/khash.h" +KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal) + +int hts_verbose = 3; + +const char *hts_version() +{ + return HTS_VERSION; +} + +const unsigned char seq_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 
15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; + +const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN"; + +/********************** + *** Basic file I/O *** + **********************/ + +// Decompress up to ten or so bytes by peeking at the file, which must be +// positioned at the start of a GZIP block. +static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize) +{ + // Typically at most a couple of hundred bytes of input are required + // to get a few bytes of output from inflate(), so hopefully this buffer + // size suffices in general. + unsigned char buffer[512]; + z_stream zs; + ssize_t npeek = hpeek(fp, buffer, sizeof buffer); + + if (npeek < 0) return 0; + + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = buffer; + zs.avail_in = npeek; + zs.next_out = dest; + zs.avail_out = destsize; + if (inflateInit2(&zs, 31) != Z_OK) return 0; + + while (zs.total_out < destsize) + if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break; + + destsize = zs.total_out; + inflateEnd(&zs); + + return destsize; +} + +// Returns whether the block contains any control characters, i.e., +// characters less than SPACE other than whitespace etc (ASCII BEL..CR). 
+static int is_binary(unsigned char *s, size_t n) +{ + size_t i; + for (i = 0; i < n; i++) + if (s[i] < 0x07 || (s[i] >= 0x0e && s[i] < 0x20)) return 1; + return 0; +} + +htsFile *hts_open(const char *fn, const char *mode) +{ + htsFile *fp = NULL; + hFILE *hfile = hopen(fn, mode); + if (hfile == NULL) goto error; + + fp = (htsFile*)calloc(1, sizeof(htsFile)); + if (fp == NULL) goto error; + + fp->fn = strdup(fn); + fp->is_be = ed_is_big(); + + if (strchr(mode, 'r')) { + unsigned char s[18]; + if (hpeek(hfile, s, 6) == 6 && memcmp(s, "CRAM", 4) == 0 && + s[4] >= 1 && s[4] <= 2 && s[5] <= 1) { + fp->is_cram = 1; + } + else if (hpeek(hfile, s, 18) == 18 && s[0] == 0x1f && s[1] == 0x8b && + (s[3] & 4) && memcmp(&s[12], "BC\2\0", 4) == 0) { + // The stream is BGZF-compressed. Decompress a few bytes to see + // whether it's in a binary format (e.g., BAM or BCF, starting + // with four bytes of magic including a control character) or is + // a bgzipped SAM or VCF text file. + fp->is_compressed = 1; + if (is_binary(s, decompress_peek(hfile, s, 4))) fp->is_bin = 1; + else fp->is_kstream = 1; + } + else if (hpeek(hfile, s, 2) == 2 && s[0] == 0x1f && s[1] == 0x8b) { + // Plain GZIP header... so a gzipped text file. + fp->is_compressed = 1; + fp->is_kstream = 1; + } + else if (hpeek(hfile, s, 4) == 4 && is_binary(s, 4)) { + // Binary format, but in a raw non-compressed form. 
+ fp->is_bin = 1; + } + else { + fp->is_kstream = 1; + } + } + else if (strchr(mode, 'w') || strchr(mode, 'a')) { + fp->is_write = 1; + if (strchr(mode, 'b')) fp->is_bin = 1; + if (strchr(mode, 'c')) fp->is_cram = 1; + if (strchr(mode, 'z')) fp->is_compressed = 1; + else if (strchr(mode, 'u')) fp->is_compressed = 0; + else fp->is_compressed = 2; // not set, default behaviour + } + else goto error; + + if (fp->is_bin || (fp->is_write && fp->is_compressed==1)) { + fp->fp.bgzf = bgzf_hopen(hfile, mode); + if (fp->fp.bgzf == NULL) goto error; + } + else if (fp->is_cram) { + fp->fp.cram = cram_dopen(hfile, fn, mode); + if (fp->fp.cram == NULL) goto error; + if (!fp->is_write) + cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1); + + } + else if (fp->is_kstream) { + #if KS_BGZF + BGZF *gzfp = bgzf_hopen(hfile, mode); + #else + // TODO Implement gzip hFILE adaptor + hclose(hfile); // This won't work, especially for stdin + gzFile gzfp = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb"); + #endif + if (gzfp) fp->fp.voidp = ks_init(gzfp); + else goto error; + } + else { + fp->fp.hfile = hfile; + } + + return fp; + +error: + if (hts_verbose >= 2) + fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn); + + if (hfile) + hclose_abruptly(hfile); + + if (fp) { + free(fp->fn); + free(fp->fn_aux); + free(fp); + } + return NULL; +} + +int hts_close(htsFile *fp) +{ + int ret, save; + + if (fp->is_bin || (fp->is_write && fp->is_compressed==1)) { + ret = bgzf_close(fp->fp.bgzf); + } else if (fp->is_cram) { + if (!fp->is_write) { + switch (cram_eof(fp->fp.cram)) { + case 0: + fprintf(stderr, "[E::%s] Failed to decode sequence.\n", __func__); + return -1; + case 2: + fprintf(stderr, "[W::%s] EOF marker is absent. 
The input is probably truncated.\n", __func__); + break; + default: /* case 1, expected EOF */ + break; + } + } + ret = cram_close(fp->fp.cram); + } else if (fp->is_kstream) { + #if KS_BGZF + BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f; + ret = bgzf_close(gzfp); + #else + gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f; + ret = gzclose(gzfp); + #endif + ks_destroy((kstream_t*)fp->fp.voidp); + } else { + ret = hclose(fp->fp.hfile); + } + + save = errno; + free(fp->fn); + free(fp->fn_aux); + free(fp->line.s); + free(fp); + errno = save; + return ret; +} + +int hts_set_threads(htsFile *fp, int n) +{ + // TODO Plug in CRAM and other threading + if (fp->is_bin) { + return bgzf_mt(fp->fp.bgzf, n, 256); + } + else return 0; +} + +int hts_set_fai_filename(htsFile *fp, const char *fn_aux) +{ + free(fp->fn_aux); + if (fn_aux) { + fp->fn_aux = strdup(fn_aux); + if (fp->fn_aux == NULL) return -1; + } + else fp->fn_aux = NULL; + + return 0; +} + +// For VCF/BCF backward sweeper. Not exposing these functions because their +// future is uncertain. Things will probably have to change with hFILE... 
+BGZF *hts_get_bgzfp(htsFile *fp) +{ + if ( fp->is_bin ) + return fp->fp.bgzf; + else + return ((kstream_t*)fp->fp.voidp)->f; +} +int hts_useek(htsFile *fp, long uoffset, int where) +{ + if ( fp->is_bin ) + return bgzf_useek(fp->fp.bgzf, uoffset, where); + else + { + ks_rewind((kstream_t*)fp->fp.voidp); + ((kstream_t*)fp->fp.voidp)->seek_pos = uoffset; + return bgzf_useek(((kstream_t*)fp->fp.voidp)->f, uoffset, where); + } +} +long hts_utell(htsFile *fp) +{ + if ( fp->is_bin ) + return bgzf_utell(fp->fp.bgzf); + else + return ((kstream_t*)fp->fp.voidp)->seek_pos; +} + +int hts_getline(htsFile *fp, int delimiter, kstring_t *str) +{ + int ret, dret; + ret = ks_getuntil((kstream_t*)fp->fp.voidp, delimiter, str, &dret); + ++fp->lineno; + return ret; +} + +char **hts_readlist(const char *string, int is_file, int *_n) +{ + int m = 0, n = 0, dret; + char **s = 0; + if ( is_file ) + { +#if KS_BGZF + BGZF *fp = bgzf_open(string, "r"); +#else + gzFile fp = gzopen(string, "r"); +#endif + if ( !fp ) return NULL; + + kstream_t *ks; + kstring_t str; + str.s = 0; str.l = str.m = 0; + ks = ks_init(fp); + while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) + { + if (str.l == 0) continue; + n++; + hts_expand(char*,n,m,s); + s[n-1] = strdup(str.s); + } + ks_destroy(ks); +#if KS_BGZF + bgzf_close(fp); +#else + gzclose(fp); +#endif + free(str.s); + } + else + { + const char *q = string, *p = string; + while ( 1 ) + { + if (*p == ',' || *p == 0) + { + n++; + hts_expand(char*,n,m,s); + s[n-1] = (char*)calloc(p - q + 1, 1); + strncpy(s[n-1], q, p - q); + q = p + 1; + } + if ( !*p ) break; + p++; + } + } + s = (char**)realloc(s, n * sizeof(char*)); + *_n = n; + return s; +} + +char **hts_readlines(const char *fn, int *_n) +{ + int m = 0, n = 0, dret; + char **s = 0; +#if KS_BGZF + BGZF *fp = bgzf_open(fn, "r"); +#else + gzFile fp = gzopen(fn, "r"); +#endif + if ( fp ) { // read from file + kstream_t *ks; + kstring_t str; + str.s = 0; str.l = str.m = 0; + ks = ks_init(fp); + while 
(ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { + if (str.l == 0) continue; + if (m == n) { + m = m? m<<1 : 16; + s = (char**)realloc(s, m * sizeof(char*)); + } + s[n++] = strdup(str.s); + } + ks_destroy(ks); + #if KS_BGZF + bgzf_close(fp); + #else + gzclose(fp); + #endif + s = (char**)realloc(s, n * sizeof(char*)); + free(str.s); + } else if (*fn == ':') { // read from string + const char *q, *p; + for (q = p = fn + 1;; ++p) + if (*p == ',' || *p == 0) { + if (m == n) { + m = m? m<<1 : 16; + s = (char**)realloc(s, m * sizeof(char*)); + } + s[n] = (char*)calloc(p - q + 1, 1); + strncpy(s[n++], q, p - q); + q = p + 1; + if (*p == 0) break; + } + } else return 0; + s = (char**)realloc(s, n * sizeof(char*)); + *_n = n; + return s; +} + +int hts_file_type(const char *fname) +{ + int len = strlen(fname); + if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ; + if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF; + if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ; + if ( !strcmp("-",fname) ) return FT_STDIN; + // ... 
etc + + int fd = open(fname, O_RDONLY); + if ( !fd ) return 0; + + uint8_t magic[5]; + if ( read(fd,magic,2)!=2 ) { close(fd); return 0; } + if ( !strncmp((char*)magic,"##",2) ) { close(fd); return FT_VCF; } + if ( !strncmp((char*)magic,"BCF",3) ) { close(fd); return FT_BCF; } + close(fd); + + if ( magic[0]==0x1f && magic[1]==0x8b ) // compressed + { + BGZF *fp = bgzf_open(fname, "r"); + if ( !fp ) return 0; + if ( bgzf_read(fp, magic, 3)!=3 ) { bgzf_close(fp); return 0; } + bgzf_close(fp); + if ( !strncmp((char*)magic,"##",2) ) return FT_VCF; + if ( !strncmp((char*)magic,"BCF",3) ) return FT_BCF_GZ; + } + return 0; +} + +/**************** + *** Indexing *** + ****************/ + +#define HTS_MIN_MARKER_DIST 0x10000 + +// Finds the special meta bin +// ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1 +#define META_BIN(idx) ((idx)->n_bins + 1) + +#define pair64_lt(a,b) ((a).u < (b).u) + +#include "htslib/ksort.h" +KSORT_INIT(_off, hts_pair64_t, pair64_lt) + +typedef struct { + int32_t m, n; + uint64_t loff; + hts_pair64_t *list; +} bins_t; + +#include "htslib/khash.h" +KHASH_MAP_INIT_INT(bin, bins_t) +typedef khash_t(bin) bidx_t; + +typedef struct { + int32_t n, m; + uint64_t *offset; +} lidx_t; + +struct __hts_idx_t { + int fmt, min_shift, n_lvls, n_bins; + uint32_t l_meta; + int32_t n, m; + uint64_t n_no_coor; + bidx_t **bidx; + lidx_t *lidx; + uint8_t *meta; + struct { + uint32_t last_bin, save_bin; + int last_coor, last_tid, save_tid, finished; + uint64_t last_off, save_off; + uint64_t off_beg, off_end; + uint64_t n_mapped, n_unmapped; + } z; // keep internal states +}; + +static inline void insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) +{ + khint_t k; + bins_t *l; + int absent; + k = kh_put(bin, b, bin, &absent); + l = &kh_value(b, k); + if (absent) { + l->m = 1; l->n = 0; + l->list = (hts_pair64_t*)calloc(l->m, 16); + } + if (l->n == l->m) { + l->m <<= 1; + l->list = (hts_pair64_t*)realloc(l->list, l->m * 16); + } + l->list[l->n].u = beg; + l->list[l->n++].v 
= end; +} + +static inline void insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift) +{ + int i, beg, end; + beg = _beg >> min_shift; + end = (_end - 1) >> min_shift; + if (l->m < end + 1) { + int old_m = l->m; + l->m = end + 1; + kroundup32(l->m); + l->offset = (uint64_t*)realloc(l->offset, l->m * 8); + memset(l->offset + old_m, 0xff, 8 * (l->m - old_m)); // fill l->offset with (uint64_t)-1 + } + if (beg == end) { // to save a loop in this case + if (l->offset[beg] == (uint64_t)-1) l->offset[beg] = offset; + } else { + for (i = beg; i <= end; ++i) + if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset; + } + if (l->n < end + 1) l->n = end + 1; +} + +hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls) +{ + hts_idx_t *idx; + idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t)); + if (idx == NULL) return NULL; + idx->fmt = fmt; + idx->min_shift = min_shift; + idx->n_lvls = n_lvls; + idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7; + idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu; + idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0; + idx->z.last_coor = 0xffffffffu; + if (n) { + idx->n = idx->m = n; + idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*)); + if (idx->bidx == NULL) { free(idx); return NULL; } + idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t)); + if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; } + } + return idx; +} + +static void update_loff(hts_idx_t *idx, int i, int free_lidx) +{ + bidx_t *bidx = idx->bidx[i]; + lidx_t *lidx = &idx->lidx[i]; + khint_t k; + int l; + uint64_t offset0 = 0; + if (bidx) { + k = kh_get(bin, bidx, META_BIN(idx)); + if (k != kh_end(bidx)) + offset0 = kh_val(bidx, k).list[0].u; + for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l) + lidx->offset[l] = offset0; + } else l = 1; + for (; l < lidx->n; ++l) // fill missing values + if (lidx->offset[l] == (uint64_t)-1) + lidx->offset[l] = 
lidx->offset[l-1]; + if (bidx == 0) return; + for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff + if (kh_exist(bidx, k)) + kh_val(bidx, k).loff = kh_key(bidx, k) < idx->n_bins? lidx->offset[hts_bin_bot(kh_key(bidx, k), idx->n_lvls)] : 0; + if (free_lidx) { + free(lidx->offset); + lidx->m = lidx->n = 0; + lidx->offset = 0; + } +} + +static void compress_binning(hts_idx_t *idx, int i) +{ + bidx_t *bidx = idx->bidx[i]; + khint_t k; + int l, m; + if (bidx == 0) return; + // merge a bin to its parent if the bin is too small + for (l = idx->n_lvls; l > 0; --l) { + unsigned start = hts_bin_first(l); + for (k = kh_begin(bidx); k != kh_end(bidx); ++k) { + bins_t *p, *q; + if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue; + p = &kh_value(bidx, k); + if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list); + if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) { + khint_t kp; + kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k))); + if (kp == kh_end(bidx)) continue; + q = &kh_val(bidx, kp); + if (q->n + p->n > q->m) { + q->m = q->n + p->n; + kroundup32(q->m); + q->list = (hts_pair64_t*)realloc(q->list, q->m * 16); + } + memcpy(q->list + q->n, p->list, p->n * 16); + q->n += p->n; + free(p->list); + kh_del(bin, bidx, k); + } + } + } + k = kh_get(bin, bidx, 0); + if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list); + // merge adjacent chunks that start from the same BGZF block + for (k = kh_begin(bidx); k != kh_end(bidx); ++k) { + bins_t *p; + if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue; + p = &kh_value(bidx, k); + for (l = 1, m = 0; l < p->n; ++l) { + if (p->list[m].v>>16 >= p->list[l].u>>16) { + if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v; + } else p->list[++m] = p->list[l]; + } + p->n = m + 1; + } +} + +void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) +{ + int i; + if (idx == NULL || idx->z.finished) 
return; // do not run this function on an empty index or multiple times + if (idx->z.save_tid >= 0) { + insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset); + insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset); + insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped); + } + for (i = 0; i < idx->n; ++i) { + update_loff(idx, i, (idx->fmt == HTS_FMT_CSI)); + compress_binning(idx, i); + } + idx->z.finished = 1; +} + +int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) +{ + int bin; + if (tid >= idx->m) { // enlarge the index + int32_t oldm = idx->m; + idx->m = idx->m? idx->m<<1 : 2; + idx->bidx = (bidx_t**)realloc(idx->bidx, idx->m * sizeof(bidx_t*)); + idx->lidx = (lidx_t*) realloc(idx->lidx, idx->m * sizeof(lidx_t)); + memset(&idx->bidx[oldm], 0, (idx->m - oldm) * sizeof(bidx_t*)); + memset(&idx->lidx[oldm], 0, (idx->m - oldm) * sizeof(lidx_t)); + } + if (idx->n < tid + 1) idx->n = tid + 1; + if (idx->z.finished) return 0; + if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome + if ( tid>=0 && idx->n_no_coor ) + { + if (hts_verbose >= 1) fprintf(stderr,"[E::%s] NO_COOR reads not in a single block at the end %d %d\n", __func__, tid,idx->z.last_tid); + return -1; + } + if (tid>=0 && idx->bidx[tid] != 0) + { + if (hts_verbose >= 1) fprintf(stderr, "[E::%s] chromosome blocks not continuous\n", __func__); + return -1; + } + idx->z.last_tid = tid; + idx->z.last_bin = 0xffffffffu; + } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order + if (hts_verbose >= 1) fprintf(stderr, "[E::%s] unsorted positions\n", __func__); + return -1; + } + if ( tid>=0 ) + { + if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin); + if ( is_mapped) + insert_to_l(&idx->lidx[tid], beg, end, idx->z.last_off, idx->min_shift); // last_off points to the start of the current record + } + 
else idx->n_no_coor++; + bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls); + if ((int)idx->z.last_bin != bin) { // then possibly write the binning index + if (idx->z.save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record + insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, idx->z.last_off); + if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information + idx->z.off_end = idx->z.last_off; + insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, idx->z.off_end); + insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped); + idx->z.n_mapped = idx->z.n_unmapped = 0; + idx->z.off_beg = idx->z.off_end; + } + idx->z.save_off = idx->z.last_off; + idx->z.save_bin = idx->z.last_bin = bin; + idx->z.save_tid = tid; + if (tid < 0) { // come to the end of the records having coordinates + hts_idx_finish(idx, offset); + return 0; + } + } + if (is_mapped) ++idx->z.n_mapped; + else ++idx->z.n_unmapped; + idx->z.last_off = offset; + idx->z.last_coor = beg; + return 0; +} + +void hts_idx_destroy(hts_idx_t *idx) +{ + khint_t k; + int i; + if (idx == 0) return; + // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c + if (idx->fmt == HTS_FMT_CRAI) { free(idx); return; } + + for (i = 0; i < idx->m; ++i) { + bidx_t *bidx = idx->bidx[i]; + free(idx->lidx[i].offset); + if (bidx == 0) continue; + for (k = kh_begin(bidx); k != kh_end(bidx); ++k) + if (kh_exist(bidx, k)) + free(kh_value(bidx, k).list); + kh_destroy(bin, bidx); + } + free(idx->bidx); free(idx->lidx); free(idx->meta); + free(idx); +} + +static inline long idx_read(int is_bgzf, void *fp, void *buf, long l) +{ + if (is_bgzf) return bgzf_read((BGZF*)fp, buf, l); + else return (long)fread(buf, 1, l, (FILE*)fp); +} + +static inline long idx_write(int is_bgzf, void *fp, const void *buf, long l) +{ + if (is_bgzf) return bgzf_write((BGZF*)fp, buf, l); + else 
return (long)fwrite(buf, 1, l, (FILE*)fp); +} + +static inline void swap_bins(bins_t *p) +{ + int i; + for (i = 0; i < p->n; ++i) { + ed_swap_8p(&p->list[i].u); + ed_swap_8p(&p->list[i].v); + } +} + +static void hts_idx_save_core(const hts_idx_t *idx, void *fp, int fmt) +{ + int32_t i, size, is_be; + int is_bgzf = (fmt != HTS_FMT_BAI); + is_be = ed_is_big(); + if (is_be) { + uint32_t x = idx->n; + idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); + } else idx_write(is_bgzf, fp, &idx->n, 4); + if (fmt == HTS_FMT_TBI && idx->l_meta) idx_write(is_bgzf, fp, idx->meta, idx->l_meta); + for (i = 0; i < idx->n; ++i) { + khint_t k; + bidx_t *bidx = idx->bidx[i]; + lidx_t *lidx = &idx->lidx[i]; + // write binning index + size = bidx? kh_size(bidx) : 0; + if (is_be) { // big endian + uint32_t x = size; + idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); + } else idx_write(is_bgzf, fp, &size, 4); + if (bidx == 0) goto write_lidx; + for (k = kh_begin(bidx); k != kh_end(bidx); ++k) { + bins_t *p; + if (!kh_exist(bidx, k)) continue; + p = &kh_value(bidx, k); + if (is_be) { // big endian + uint32_t x; + x = kh_key(bidx, k); idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); + if (fmt == HTS_FMT_CSI) { + uint64_t y = kh_val(bidx, k).loff; + idx_write(is_bgzf, fp, ed_swap_4p(&y), 8); + } + x = p->n; idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); + swap_bins(p); + idx_write(is_bgzf, fp, p->list, 16 * p->n); + swap_bins(p); + } else { + idx_write(is_bgzf, fp, &kh_key(bidx, k), 4); + if (fmt == HTS_FMT_CSI) idx_write(is_bgzf, fp, &kh_val(bidx, k).loff, 8); + //int j;for(j=0;jn;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v); + idx_write(is_bgzf, fp, &p->n, 4); + idx_write(is_bgzf, fp, p->list, p->n << 4); + } + } +write_lidx: + if (fmt != HTS_FMT_CSI) { + if (is_be) { + int32_t x = lidx->n; + idx_write(is_bgzf, fp, ed_swap_4p(&x), 4); + for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]); + idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3); + 
for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]); + } else { + idx_write(is_bgzf, fp, &lidx->n, 4); + idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3); + } + } + } + if (is_be) { // write the number of reads without coordinates + uint64_t x = idx->n_no_coor; + idx_write(is_bgzf, fp, &x, 8); + } else idx_write(is_bgzf, fp, &idx->n_no_coor, 8); +} + +void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) +{ + char *fnidx; + fnidx = (char*)calloc(1, strlen(fn) + 5); + strcpy(fnidx, fn); + if (fmt == HTS_FMT_CSI) { + BGZF *fp; + uint32_t x[3]; + int is_be, i; + is_be = ed_is_big(); + fp = bgzf_open(strcat(fnidx, ".csi"), "w"); + bgzf_write(fp, "CSI\1", 4); + x[0] = idx->min_shift; x[1] = idx->n_lvls; x[2] = idx->l_meta; + if (is_be) { + for (i = 0; i < 3; ++i) + bgzf_write(fp, ed_swap_4p(&x[i]), 4); + } else bgzf_write(fp, &x, 12); + if (idx->l_meta) bgzf_write(fp, idx->meta, idx->l_meta); + hts_idx_save_core(idx, fp, HTS_FMT_CSI); + bgzf_close(fp); + } else if (fmt == HTS_FMT_TBI) { + BGZF *fp; + fp = bgzf_open(strcat(fnidx, ".tbi"), "w"); + bgzf_write(fp, "TBI\1", 4); + hts_idx_save_core(idx, fp, HTS_FMT_TBI); + bgzf_close(fp); + } else if (fmt == HTS_FMT_BAI) { + FILE *fp; + fp = fopen(strcat(fnidx, ".bai"), "w"); + fwrite("BAI\1", 1, 4, fp); + hts_idx_save_core(idx, fp, HTS_FMT_BAI); + fclose(fp); + } else abort(); + free(fnidx); +} + +static int hts_idx_load_core(hts_idx_t *idx, void *fp, int fmt) +{ + int32_t i, n, is_be; + int is_bgzf = (fmt != HTS_FMT_BAI); + is_be = ed_is_big(); + if (idx == NULL) return -4; + for (i = 0; i < idx->n; ++i) { + bidx_t *h; + lidx_t *l = &idx->lidx[i]; + uint32_t key; + int j, absent; + bins_t *p; + h = idx->bidx[i] = kh_init(bin); + if (idx_read(is_bgzf, fp, &n, 4) != 4) return -1; + if (is_be) ed_swap_4p(&n); + for (j = 0; j < n; ++j) { + khint_t k; + if (idx_read(is_bgzf, fp, &key, 4) != 4) return -1; + if (is_be) ed_swap_4p(&key); + k = kh_put(bin, h, key, &absent); + if (absent <= 0) return -3; // 
Duplicate bin number + p = &kh_val(h, k); + if (fmt == HTS_FMT_CSI) { + if (idx_read(is_bgzf, fp, &p->loff, 8) != 8) return -1; + if (is_be) ed_swap_8p(&p->loff); + } else p->loff = 0; + if (idx_read(is_bgzf, fp, &p->n, 4) != 4) return -1; + if (is_be) ed_swap_4p(&p->n); + p->m = p->n; + p->list = (hts_pair64_t*)malloc(p->m * 16); + if (p->list == NULL) return -2; + if (idx_read(is_bgzf, fp, p->list, p->n<<4) != p->n<<4) return -1; + if (is_be) swap_bins(p); + } + if (fmt != HTS_FMT_CSI) { // load linear index + int j; + if (idx_read(is_bgzf, fp, &l->n, 4) != 4) return -1; + if (is_be) ed_swap_4p(&l->n); + l->m = l->n; + l->offset = (uint64_t*)malloc(l->n << 3); + if (l->offset == NULL) return -2; + if (idx_read(is_bgzf, fp, l->offset, l->n << 3) != l->n << 3) return -1; + if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]); + for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix + if (l->offset[j] == 0) l->offset[j] = l->offset[j-1]; + update_loff(idx, i, 1); + } + } + if (idx_read(is_bgzf, fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0; + if (is_be) ed_swap_8p(&idx->n_no_coor); + return 0; +} + +hts_idx_t *hts_idx_load_local(const char *fn, int fmt) +{ + uint8_t magic[4]; + int i, is_be; + hts_idx_t *idx = NULL; + is_be = ed_is_big(); + if (fmt == HTS_FMT_CSI) { + BGZF *fp; + uint32_t x[3], n; + uint8_t *meta = 0; + if ((fp = bgzf_open(fn, "r")) == 0) return NULL; + if (bgzf_read(fp, magic, 4) != 4) goto csi_fail; + if (memcmp(magic, "CSI\1", 4) != 0) goto csi_fail; + if (bgzf_read(fp, x, 12) != 12) goto csi_fail; + if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]); + if (x[2]) { + if ((meta = (uint8_t*)malloc(x[2])) == NULL) goto csi_fail; + if (bgzf_read(fp, meta, x[2]) != x[2]) goto csi_fail; + } + if (bgzf_read(fp, &n, 4) != 4) goto csi_fail; + if (is_be) ed_swap_4p(&n); + if ((idx = hts_idx_init(n, fmt, 0, x[0], x[1])) == NULL) goto csi_fail; + idx->l_meta = x[2]; + idx->meta = meta; + meta = NULL; + if 
(hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto csi_fail; + bgzf_close(fp); + return idx; + + csi_fail: + bgzf_close(fp); + hts_idx_destroy(idx); + free(meta); + return NULL; + + } else if (fmt == HTS_FMT_TBI) { + BGZF *fp; + uint32_t x[8]; + if ((fp = bgzf_open(fn, "r")) == 0) return NULL; + if (bgzf_read(fp, magic, 4) != 4) goto tbi_fail; + if (memcmp(magic, "TBI\1", 4) != 0) goto tbi_fail; + if (bgzf_read(fp, x, 32) != 32) goto tbi_fail; + if (is_be) for (i = 0; i < 8; ++i) ed_swap_4p(&x[i]); + if ((idx = hts_idx_init(x[0], fmt, 0, 14, 5)) == NULL) goto tbi_fail; + idx->l_meta = 28 + x[7]; + if ((idx->meta = (uint8_t*)malloc(idx->l_meta)) == NULL) goto tbi_fail; + memcpy(idx->meta, &x[1], 28); + if (bgzf_read(fp, idx->meta + 28, x[7]) != x[7]) goto tbi_fail; + if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto tbi_fail; + bgzf_close(fp); + return idx; + + tbi_fail: + bgzf_close(fp); + hts_idx_destroy(idx); + return NULL; + + } else if (fmt == HTS_FMT_BAI) { + uint32_t n; + FILE *fp; + if ((fp = fopen(fn, "rb")) == 0) return NULL; + if (fread(magic, 1, 4, fp) != 4) goto bai_fail; + if (memcmp(magic, "BAI\1", 4) != 0) goto bai_fail; + if (fread(&n, 4, 1, fp) != 1) goto bai_fail; + if (is_be) ed_swap_4p(&n); + idx = hts_idx_init(n, fmt, 0, 14, 5); + if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto bai_fail; + fclose(fp); + return idx; + + bai_fail: + fclose(fp); + hts_idx_destroy(idx); + return NULL; + + } else abort(); +} + +void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy) +{ + if (idx->meta) free(idx->meta); + idx->l_meta = l_meta; + if (is_copy) { + idx->meta = (uint8_t*)malloc(l_meta); + memcpy(idx->meta, meta, l_meta); + } else idx->meta = meta; +} + +uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta) +{ + *l_meta = idx->l_meta; + return idx->meta; +} + +const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) +{ + if ( !idx->n ) + { + *n = 0; + return NULL; + } + + int tid = 0, i; 
+ const char **names = (const char**) calloc(idx->n,sizeof(const char*)); + for (i=0; in; i++) + { + bidx_t *bidx = idx->bidx[i]; + if ( !bidx ) continue; + names[tid++] = getid(hdr,i); + } + *n = tid; + return names; +} + +int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped) +{ + if ( idx->fmt == HTS_FMT_CRAI ) { + *mapped = 0; *unmapped = 0; + return -1; + } + + bidx_t *h = idx->bidx[tid]; + khint_t k = kh_get(bin, h, META_BIN(idx)); + if (k != kh_end(h)) { + *mapped = kh_val(h, k).list[1].u; + *unmapped = kh_val(h, k).list[1].v; + return 0; + } else { + *mapped = 0; *unmapped = 0; + return -1; + } +} + +uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) +{ + return idx->n_no_coor; +} + +/**************** + *** Iterator *** + ****************/ + +static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls) +{ + int l, t, s = min_shift + (n_lvls<<1) + n_lvls; + if (beg >= end) return 0; + if (end >= 1LL<>s); e = t + (end>>s); n = e - b + 1; + if (itr->bins.n + n > itr->bins.m) { + itr->bins.m = itr->bins.n + n; + kroundup32(itr->bins.m); + itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m); + } + for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i; + } + return itr->bins.n; +} + +hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) +{ + int i, n_off, l, bin; + hts_pair64_t *off; + khint_t k; + bidx_t *bidx; + uint64_t min_off; + hts_itr_t *iter = 0; + if (tid < 0) { + int finished0 = 0; + uint64_t off0 = (uint64_t)-1; + khint_t k; + switch (tid) { + case HTS_IDX_START: + // Find the smallest offset, note that sequence ids may not be ordered sequentially + for (i=0; in; i++) + { + bidx = idx->bidx[i]; + k = kh_get(bin, bidx, META_BIN(idx)); + if (k == kh_end(bidx)) continue; + if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u; + } + if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor 
reads in this bam + break; + + case HTS_IDX_NOCOOR: + if ( idx->n>0 ) + { + bidx = idx->bidx[idx->n - 1]; + k = kh_get(bin, bidx, META_BIN(idx)); + if (k != kh_end(bidx)) off0 = kh_val(bidx, k).list[0].v; + } + if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam + break; + + case HTS_IDX_REST: + off0 = 0; + break; + + case HTS_IDX_NONE: + finished0 = 1; + off0 = 0; + break; + + default: + return 0; + } + if (off0 != (uint64_t)-1) { + iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); + iter->read_rest = 1; + iter->finished = finished0; + iter->curr_off = off0; + iter->readrec = readrec; + return iter; + } else return 0; + } + + if (beg < 0) beg = 0; + if (end < beg) return 0; + if ((bidx = idx->bidx[tid]) == 0) return 0; + + iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); + iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; + iter->readrec = readrec; + + // compute min_off + bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); + do { + int first; + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx)) break; + first = (hts_bin_parent(bin)<<3) + 1; + if (bin > first) --bin; + else bin = hts_bin_parent(bin); + } while (bin); + if (bin == 0) k = kh_get(bin, bidx, bin); + min_off = k != kh_end(bidx)? 
kh_val(bidx, k).loff : 0; + // retrieve bins + reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls); + for (i = n_off = 0; i < iter->bins.n; ++i) + if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) + n_off += kh_value(bidx, k).n; + if (n_off == 0) return iter; + off = (hts_pair64_t*)calloc(n_off, 16); + for (i = n_off = 0; i < iter->bins.n; ++i) { + if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) { + int j; + bins_t *p = &kh_value(bidx, k); + for (j = 0; j < p->n; ++j) + if (p->list[j].v > min_off) off[n_off++] = p->list[j]; + } + } + if (n_off == 0) { + free(off); return iter; + } + ks_introsort(_off, n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) + if (off[l].v < off[i].v) off[++l] = off[i]; + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + // merge adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; + else off[++l] = off[i]; + } + n_off = l + 1; + iter->n_off = n_off; iter->off = off; + return iter; +} + +void hts_itr_destroy(hts_itr_t *iter) +{ + if (iter) { free(iter->off); free(iter->bins.a); free(iter); } +} + +const char *hts_parse_reg(const char *s, int *beg, int *end) +{ + int i, k, l, name_end; + *beg = *end = -1; + name_end = l = strlen(s); + // determine the sequence name + for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end + if (i >= 0) name_end = i; + if (name_end < l) { // check if this is really the end + int n_hyphen = 0; + for (i = name_end + 1; i < l; ++i) { + if (s[i] == '-') ++n_hyphen; + else if (!isdigit(s[i]) && s[i] != ',') break; + } + if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name + } + // parse the interval + if (name_end < l) { + char *tmp; + tmp = (char*)alloca(l - name_end 
+ 1); + for (i = name_end + 1, k = 0; i < l; ++i) + if (s[i] != ',') tmp[k++] = s[i]; + tmp[k] = 0; + if ((*beg = strtol(tmp, &tmp, 10) - 1) < 0) *beg = 0; + *end = *tmp? strtol(tmp + 1, &tmp, 10) : 1<<29; + if (*beg > *end) name_end = l; + } + if (name_end == l) *beg = 0, *end = 1<<29; + return s + name_end; +} + +hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) +{ + int tid, beg, end; + char *q, *tmp; + if (strcmp(reg, ".") == 0) + return itr_query(idx, HTS_IDX_START, 0, 1<<29, readrec); + else if (strcmp(reg, "*") != 0) { + q = (char*)hts_parse_reg(reg, &beg, &end); + tmp = (char*)alloca(q - reg + 1); + strncpy(tmp, reg, q - reg); + tmp[q - reg] = 0; + if ((tid = getid(hdr, tmp)) < 0) + tid = getid(hdr, reg); + if (tid < 0) return 0; + return itr_query(idx, tid, beg, end, readrec); + } else return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec); +} + +int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) +{ + int ret, tid, beg, end; + if (iter == NULL || iter->finished) return -1; + if (iter->read_rest) { + if (iter->curr_off) { // seek to the start + bgzf_seek(fp, iter->curr_off, SEEK_SET); + iter->curr_off = 0; // only seek once + } + ret = iter->readrec(fp, data, r, &tid, &beg, &end); + if (ret < 0) iter->finished = 1; + return ret; + } + if (iter->off == 0) return -1; + for (;;) { + if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk + if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks + if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek + bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET); + iter->curr_off = bgzf_tell(fp); + } + ++iter->i; + } + if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) { + iter->curr_off = bgzf_tell(fp); + if (tid != iter->tid || beg >= iter->end) { // no need to proceed + ret = -1; break; + 
} else if (end > iter->beg && iter->end > beg) return ret; + } else break; // end of file or error + } + iter->finished = 1; + return ret; +} + +/********************** + *** Retrieve index *** + **********************/ + +static char *test_and_fetch(const char *fn) +{ + FILE *fp; + // FIXME Use is_remote_scheme() helper that's true for ftp/http/irods/etc + if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) { + const int buf_size = 1 * 1024 * 1024; + hFILE *fp_remote; + uint8_t *buf; + int l; + const char *p; + for (p = fn + strlen(fn) - 1; p >= fn; --p) + if (*p == '/') break; + ++p; // p now points to the local file name + if ((fp_remote = hopen(fn, "r")) == 0) { + if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to open remote file '%s'\n", __func__, fn); + return 0; + } + if ((fp = fopen(p, "w")) == 0) { + if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p); + hclose_abruptly(fp_remote); + return 0; + } + if (hts_verbose >= 3) fprintf(stderr, "[M::%s] downloading file '%s' to local directory\n", __func__, fn); + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + if (hclose(fp_remote) != 0) fprintf(stderr, "[E::%s] fail to close remote file '%s'\n", __func__, fn); + return (char*)p; + } else { + if ((fp = fopen(fn, "rb")) == 0) return 0; + fclose(fp); + return (char*)fn; + } +} + +char *hts_idx_getfn(const char *fn, const char *ext) +{ + int i, l_fn, l_ext; + char *fnidx, *ret; + l_fn = strlen(fn); l_ext = strlen(ext); + fnidx = (char*)calloc(l_fn + l_ext + 1, 1); + strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext); + if ((ret = test_and_fetch(fnidx)) == 0) { + for (i = l_fn - 1; i > 0; --i) + if (fnidx[i] == '.') break; + strcpy(fnidx + i, ext); + ret = test_and_fetch(fnidx); + } + if (ret == 0) { + free(fnidx); + return 0; + } + l_fn = strlen(ret); + memmove(fnidx, ret, l_fn + 1); + return fnidx; +} + 
+hts_idx_t *hts_idx_load(const char *fn, int fmt) +{ + char *fnidx; + hts_idx_t *idx; + fnidx = hts_idx_getfn(fn, ".csi"); + if (fnidx) fmt = HTS_FMT_CSI; + else fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi"); + if (fnidx == 0) return 0; + + // Check that the index file is up to date, the main file might have changed + struct stat stat_idx,stat_main; + if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) ) + { + if ( stat_idx.st_mtime < stat_main.st_mtime ) + fprintf(stderr, "Warning: The index file is older than the data file: %s\n", fnidx); + } + idx = hts_idx_load_local(fnidx, fmt); + free(fnidx); + return idx; +} diff --git a/star-sys/STAR/source/htslib/htslib.mk b/star-sys/STAR/source/htslib/htslib.mk new file mode 100644 index 0000000..28d0a8f --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib.mk @@ -0,0 +1,127 @@ +# Makefile rules useful for third-party code using htslib's public API. +# +# Copyright (C) 2013-2014 Genome Research Ltd. +# +# Author: John Marshall + +# The makefile fragment included below provides variables that can be used +# to express dependencies on headers supplied by an in-development htslib. +# If your source file foo.c #includes and , +# you can write the correct prerequisites for foo.o as: +# +# HTSDIR = +# include $(HTSDIR)/htslib.mk +# +# foo.o: foo.c $(htslib_hts_h) $(HTSDIR)/htslib/kstring.h +# +# Variables are not provided for k*.h, as those never include other headers. + +HTSPREFIX = $(HTSDIR)/ +include $(HTSDIR)/htslib_vars.mk + +# Rules for rebuilding an in-development htslib's static and shared libraries. 
+# If your program foo links with libhts, adding the appropriate prerequisite +# will cause the library to be rebuilt as necessary: +# +# foo: foo.o $(HTSDIR)/libhts.a +# +# or similarly if your target requires any of the tools supplied: +# +# bar.bed.bgz.tbi: bar.bed.bgz $(HTSDIR)/tabix +# $(HTSDIR)/tabix -p bed bar.bed.bgz + +HTSLIB_PUBLIC_HEADERS = \ + $(HTSDIR)/htslib/bgzf.h \ + $(HTSDIR)/htslib/faidx.h \ + $(HTSDIR)/htslib/hfile.h \ + $(HTSDIR)/htslib/hts.h \ + $(HTSDIR)/htslib/hts_defs.h \ + $(HTSDIR)/htslib/khash.h \ + $(HTSDIR)/htslib/klist.h \ + $(HTSDIR)/htslib/knetfile.h \ + $(HTSDIR)/htslib/kseq.h \ + $(HTSDIR)/htslib/ksort.h \ + $(HTSDIR)/htslib/kstdint.h \ + $(HTSDIR)/htslib/kstring.h \ + $(HTSDIR)/htslib/sam.h \ + $(HTSDIR)/htslib/synced_bcf_reader.h \ + $(HTSDIR)/htslib/tbx.h \ + $(HTSDIR)/htslib/vcf.h \ + $(HTSDIR)/htslib/vcf_sweep.h \ + $(HTSDIR)/htslib/vcfutils.h + +HTSLIB_ALL = \ + $(HTSLIB_PUBLIC_HEADERS) \ + $(HTSDIR)/bgzf.c \ + $(HTSDIR)/faidx.c \ + $(HTSDIR)/hfile_internal.h \ + $(HTSDIR)/hfile.c \ + $(HTSDIR)/hfile_net.c \ + $(HTSDIR)/hts.c \ + $(HTSDIR)/knetfile.c \ + $(HTSDIR)/kstring.c \ + $(HTSDIR)/sam.c \ + $(HTSDIR)/synced_bcf_reader.c \ + $(HTSDIR)/tbx.c \ + $(HTSDIR)/vcf.c \ + $(HTSDIR)/vcf_sweep.c \ + $(HTSDIR)/vcfutils.c \ + $(HTSDIR)/cram/cram.h \ + $(HTSDIR)/cram/cram_codecs.c \ + $(HTSDIR)/cram/cram_codecs.h \ + $(HTSDIR)/cram/cram_decode.c \ + $(HTSDIR)/cram/cram_decode.h \ + $(HTSDIR)/cram/cram_encode.c \ + $(HTSDIR)/cram/cram_encode.h \ + $(HTSDIR)/cram/cram_index.c \ + $(HTSDIR)/cram/cram_index.h \ + $(HTSDIR)/cram/cram_io.c \ + $(HTSDIR)/cram/cram_io.h \ + $(HTSDIR)/cram/cram_samtools.c \ + $(HTSDIR)/cram/cram_samtools.h \ + $(HTSDIR)/cram/cram_stats.c \ + $(HTSDIR)/cram/cram_stats.h \ + $(HTSDIR)/cram/cram_structs.h \ + $(HTSDIR)/cram/files.c \ + $(HTSDIR)/cram/mFILE.c \ + $(HTSDIR)/cram/mFILE.h \ + $(HTSDIR)/cram/md5.c \ + $(HTSDIR)/cram/md5.h \ + $(HTSDIR)/cram/misc.h \ + $(HTSDIR)/cram/open_trace_file.c \ + 
$(HTSDIR)/cram/open_trace_file.h \ + $(HTSDIR)/cram/os.h \ + $(HTSDIR)/cram/pooled_alloc.c \ + $(HTSDIR)/cram/pooled_alloc.h \ + $(HTSDIR)/cram/sam_header.c \ + $(HTSDIR)/cram/sam_header.h \ + $(HTSDIR)/cram/string_alloc.c \ + $(HTSDIR)/cram/string_alloc.h \ + $(HTSDIR)/cram/thread_pool.c \ + $(HTSDIR)/cram/thread_pool.h \ + $(HTSDIR)/cram/vlen.c \ + $(HTSDIR)/cram/vlen.h \ + $(HTSDIR)/cram/zfio.c \ + $(HTSDIR)/cram/zfio.h + +$(HTSDIR)/libhts.a: $(HTSLIB_ALL) + +cd $(HTSDIR) && $(MAKE) lib-static + +$(HTSDIR)/libhts.so $(HTSDIR)/libhts.dylib: $(HTSLIB_ALL) + +cd $(HTSDIR) && $(MAKE) lib-shared + +$(HTSDIR)/bgzip: $(HTSDIR)/bgzip.c $(HTSLIB_PUBLIC_HEADERS) + +cd $(HTSDIR) && $(MAKE) bgzip + +$(HTSDIR)/tabix: $(HTSDIR)/tabix.c $(HTSLIB_PUBLIC_HEADERS) + +cd $(HTSDIR) && $(MAKE) tabix + +# Rules for phony targets. You may wish to have your corresponding phony +# targets invoke these in addition to their own recipes: +# +# clean: clean-htslib + +clean-htslib install-htslib: + +cd $(HTSDIR) && $(MAKE) $(@:-htslib=) + +.PHONY: clean-htslib install-htslib diff --git a/star-sys/STAR/source/htslib/htslib/bgzf.h b/star-sys/STAR/source/htslib/htslib/bgzf.h new file mode 100644 index 0000000..1edce41 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/bgzf.h @@ -0,0 +1,313 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + 2011, 2012 Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* The BGZF library was originally written by Bob Handsaker from the Broad + * Institute. It was later improved by the SAMtools developers. */ + +#ifndef __BGZF_H +#define __BGZF_H + +#include +#include +#include +#include + +#define BGZF_BLOCK_SIZE 0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE +#define BGZF_MAX_BLOCK_SIZE 0x10000 + +#define BGZF_ERR_ZLIB 1 +#define BGZF_ERR_HEADER 2 +#define BGZF_ERR_IO 4 +#define BGZF_ERR_MISUSE 8 + +struct hFILE; +struct bgzf_mtaux_t; +typedef struct __bgzidx_t bgzidx_t; + +struct BGZF { + int errcode:16, is_write:2, is_be:2, compress_level:9, is_compressed:2, is_gzip:1; + int cache_size; + int block_length, block_offset; + int64_t block_address, uncompressed_address; + void *uncompressed_block, *compressed_block; + void *cache; // a pointer to a hash table + struct hFILE *fp; // actual file handle + struct bgzf_mtaux_t *mt; // only used for multi-threading + bgzidx_t *idx; // BGZF index + int idx_build_otf; // build index on the fly, set by bgzf_index_build_init() + z_stream *gz_stream;// for gzip-compressed files +}; +#ifndef HTS_BGZF_TYPEDEF +typedef struct BGZF BGZF; +#define HTS_BGZF_TYPEDEF +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + /****************** + * Basic routines * + ******************/ + + /** + * Open an existing file descriptor for reading or writing. 
+ * + * @param fd file descriptor + * @param mode mode matching /[rwa][u0-9]+/: 'r' for reading, 'w' for + * writing, or 'a' for appending, while a digit specifies + * the zlib compression level. + * Note that there is a distinction between 'u' and '0': the + * first yields plain uncompressed output whereas the latter + * outputs uncompressed data wrapped in the zlib format. + * @return BGZF file handler; 0 on error + */ + BGZF* bgzf_dopen(int fd, const char *mode); + + #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility + + /** + * Open the specified file for reading or writing. + */ + BGZF* bgzf_open(const char* path, const char *mode); + + /** + * Open an existing hFILE stream for reading or writing. + */ + BGZF* bgzf_hopen(struct hFILE *fp, const char *mode); + + /** + * Close the BGZF and free all associated resources. + * + * @param fp BGZF file handler + * @return 0 on success and -1 on error + */ + int bgzf_close(BGZF *fp); + + /** + * Read up to _length_ bytes from the file storing into _data_. + * + * @param fp BGZF file handler + * @param data data array to read into + * @param length size of data to read + * @return number of bytes actually read; 0 on end-of-file and -1 on error + */ + ssize_t bgzf_read(BGZF *fp, void *data, size_t length); + + /** + * Write _length_ bytes from _data_ to the file. If no I/O errors occur, + * the complete _length_ bytes will be written (or queued for writing). + * + * @param fp BGZF file handler + * @param data data array to write + * @param length size of data to write + * @return number of bytes written (i.e., _length_); negative on error + */ + ssize_t bgzf_write(BGZF *fp, const void *data, size_t length); + + /** + * Read up to _length_ bytes directly from the underlying stream without + * decompressing. Bypasses BGZF blocking, so must be used with care in + * specialised circumstances only. 
+ * + * @param fp BGZF file handler + * @param data data array to read into + * @param length number of raw bytes to read + * @return number of bytes actually read; 0 on end-of-file and -1 on error + */ + ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length); + + /** + * Write _length_ bytes directly to the underlying stream without + * compressing. Bypasses BGZF blocking, so must be used with care + * in specialised circumstances only. + * + * @param fp BGZF file handler + * @param data data array to write + * @param length number of raw bytes to write + * @return number of bytes actually written; -1 on error + */ + ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length); + + /** + * Write the data in the buffer to the file. + */ + int bgzf_flush(BGZF *fp); + + /** + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + */ + #define bgzf_tell(fp) (((fp)->block_address << 16) | ((fp)->block_offset & 0xFFFF)) + + /** + * Set the file to read from the location specified by _pos_. 
+ * + * @param fp BGZF file handler + * @param pos virtual file offset returned by bgzf_tell() + * @param whence must be SEEK_SET + * @return 0 on success and -1 on error + */ + int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence); + + /** + * Check if the BGZF end-of-file (EOF) marker is present + * + * @param fp BGZF file handler opened for reading + * @return 1 if the EOF marker is present and correct; + * 2 if it can't be checked, e.g., because fp isn't seekable; + * 0 if the EOF marker is absent; + * -1 (with errno set) on error + */ + int bgzf_check_EOF(BGZF *fp); + + /** + * Check if a file is in the BGZF format + * + * @param fn file name + * @return 1 if _fn_ is BGZF; 0 if not or on I/O error + */ + int bgzf_is_bgzf(const char *fn); + + /********************* + * Advanced routines * + *********************/ + + /** + * Set the cache size. Only effective when compiled with -DBGZF_CACHE. + * + * @param fp BGZF file handler + * @param size size of cache in bytes; 0 to disable caching (default) + */ + void bgzf_set_cache_size(BGZF *fp, int size); + + /** + * Flush the file if the remaining buffer size is smaller than _size_ + * @return 0 if flushing succeeded or was not needed; negative on error + */ + int bgzf_flush_try(BGZF *fp, ssize_t size); + + /** + * Read one byte from a BGZF file. It is faster than bgzf_read() + * @param fp BGZF file handler + * @return byte read; -1 on end-of-file or error + */ + int bgzf_getc(BGZF *fp); + + /** + * Read one line from a BGZF file. It is faster than bgzf_getc() + * + * @param fp BGZF file handler + * @param delim delimitor + * @param str string to write to; must be initialized + * @return length of the string; 0 on end-of-file; negative on error + */ + int bgzf_getline(BGZF *fp, int delim, kstring_t *str); + + /** + * Read the next BGZF block. 
+ */ + int bgzf_read_block(BGZF *fp); + + /** + * Enable multi-threading (only effective on writing and when the + * library was compiled with -DBGZF_MT) + * + * @param fp BGZF file handler; must be opened for writing + * @param n_threads #threads used for writing + * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended + */ + int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); + + + /******************* + * bgzidx routines * + *******************/ + + /** + * Position BGZF at the uncompressed offset + * + * @param fp BGZF file handler; must be opened for reading + * @param uoffset file offset in the uncompressed data + * @param where SEEK_SET supported atm + * + * Returns 0 on success and -1 on error. + */ + int bgzf_useek(BGZF *fp, long uoffset, int where); + + /** + * Position in uncompressed BGZF + * + * @param fp BGZF file handler; must be opened for reading + * + * Returns the current offset on success and -1 on error. + */ + long bgzf_utell(BGZF *fp); + + /** + * Tell BGZF to build index while compressing. + * + * @param fp BGZF file handler; can be opened for reading or writing. + * + * Returns 0 on success and -1 on error. + */ + int bgzf_index_build_init(BGZF *fp); + + /** + * Load BGZF index + * + * @param fp BGZF file handler + * @param bname base name + * @param suffix suffix to add to bname (can be NULL) + * + * Returns 0 on success and -1 on error. + */ + int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix); + + /** + * Save BGZF index + * + * @param fp BGZF file handler + * @param bname base name + * @param suffix suffix to add to bname (can be NULL) + * + * Returns 0 on success and -1 on error. 
+ */ + int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/faidx.h b/star-sys/STAR/source/htslib/htslib/faidx.h new file mode 100644 index 0000000..3153b98 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/faidx.h @@ -0,0 +1,112 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li , Petr Danecek */ + +#ifndef FAIDX_H +#define FAIDX_H + +/*! + @header + + Index FASTA files and extract subsequence. 
+ + The fai file index columns are: + - chromosome name + - chromosome length: number of bases + - offset: number of bytes to skip to get to the first base + from the beginning of the file, including the length + of the sequence description string (">chr ..\n") + - line length: number of bases per line (excluding \n) + - binary line length: number of bytes, including \n + + @copyright The Wellcome Trust Sanger Institute. + */ + +struct __faidx_t; +typedef struct __faidx_t faidx_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! + @abstract Build index for a FASTA or bgzip-compressed FASTA file. + @param fn FASTA file name + @return 0 on success; or -1 on failure + @discussion File "fn.fai" will be generated. + */ + int fai_build(const char *fn); + + /*! + @abstract Distroy a faidx_t struct. + @param fai Pointer to the struct to be destroyed + */ + void fai_destroy(faidx_t *fai); + + /*! + @abstract Load index from "fn.fai". + @param fn File name of the FASTA file + */ + faidx_t *fai_load(const char *fn); + + /*! + @abstract Fetch the sequence in a region. + @param fai Pointer to the faidx_t struct + @param reg Region in the format "chr2:20,000-30,000" + @param len Length of the region; -2 if seq not present, -1 general error + @return Pointer to the sequence; null on failure + + @discussion The returned sequence is allocated by malloc family + and should be destroyed by end users by calling free() on it. + */ + char *fai_fetch(const faidx_t *fai, const char *reg, int *len); + + /*! + @abstract Fetch the number of sequences. + @param fai Pointer to the faidx_t struct + @return The number of sequences + */ + int faidx_fetch_nseq(const faidx_t *fai); + + /*! + @abstract Fetch the sequence in a region. 
+ @param fai Pointer to the faidx_t struct + @param c_name Region name + @param p_beg_i Beginning position number (zero-based) + @param p_end_i End position number (zero-based) + @param len Length of the region; -2 if c_name not present, -1 general error + @return Pointer to the sequence; null on failure + + @discussion The returned sequence is allocated by malloc family + and should be destroyed by end users by calling free() on it. + */ + char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/hfile.h b/star-sys/STAR/source/htslib/htslib/hfile.h new file mode 100644 index 0000000..68175e7 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/hfile.h @@ -0,0 +1,204 @@ +/* hfile.h -- buffered low-level input/output streams. + + Copyright (C) 2013-2014 Genome Research Ltd. + + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#ifndef HTSLIB_HFILE_H +#define HTSLIB_HFILE_H + +#include + +#include + +#include "hts_defs.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* These fields are declared here solely for the benefit of the inline functions + below. They may change in future releases. User code should not use them + directly; you should imagine that hFILE is an opaque incomplete type. */ +struct hFILE_backend; +typedef struct hFILE { + char *buffer, *begin, *end, *limit; + const struct hFILE_backend *backend; + off_t offset; + int at_eof:1; + int has_errno; +} hFILE; + +/*! + @abstract Open the named file or URL as a stream + @return An hFILE pointer, or NULL (with errno set) if an error occurred. +*/ +hFILE *hopen(const char *filename, const char *mode) HTS_RESULT_USED; + +/*! + @abstract Associate a stream with an existing open file descriptor + @return An hFILE pointer, or NULL (with errno set) if an error occurred. + @notes For socket descriptors (on Windows), mode should contain 's'. +*/ +hFILE *hdopen(int fd, const char *mode) HTS_RESULT_USED; + +/*! + @abstract Flush (for output streams) and close the stream + @return 0 if successful, or EOF (with errno set) if an error occurred. +*/ +int hclose(hFILE *fp) HTS_RESULT_USED; + +/*! + @abstract Close the stream, without flushing or propagating errors + @notes For use while cleaning up after an error only. Preserves errno. +*/ +void hclose_abruptly(hFILE *fp); + +/*! + @abstract Return the stream's error indicator + @return Non-zero (in fact, an errno value) if an error has occurred. + @notes This would be called herror() and return true/false to parallel + ferror(3), but a networking-related herror(3) function already exists. */ +static inline int herrno(hFILE *fp) +{ + return fp->has_errno; +} + +/*! + @abstract Clear the stream's error indicator +*/ +static inline void hclearerr(hFILE *fp) +{ + fp->has_errno = 0; +} + +/*! 
+ @abstract Reposition the read/write stream offset + @return The resulting offset within the stream (as per lseek(2)), + or negative if an error occurred. +*/ +off_t hseek(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED; + +/*! + @abstract Report the current stream offset + @return The offset within the stream, starting from zero. +*/ +static inline off_t htell(hFILE *fp) +{ + return fp->offset + (fp->begin - fp->buffer); +} + +/*! + @abstract Read one character from the stream + @return The character read, or EOF on end-of-file or error +*/ +static inline int hgetc(hFILE *fp) +{ + extern int hgetc2(hFILE *); + return (fp->end > fp->begin)? (unsigned char) *(fp->begin++) : hgetc2(fp); +} + +/*! + @abstract Peek at characters to be read without removing them from buffers + @param fp The file stream + @param buffer The buffer to which the peeked bytes will be written + @param nbytes The number of bytes to peek at; limited by the size of the + internal buffer, which could be as small as 4K. + @return The number of bytes peeked, which may be less than nbytes if EOF + is encountered; or negative, if there was an I/O error. + @notes The characters peeked at remain in the stream's internal buffer, + and will be returned by later hread() etc calls. +*/ +ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED; + +/*! + @abstract Read a block of characters from the file + @return The number of bytes read, or negative if an error occurred. + @notes The full nbytes requested will be returned, except as limited + by EOF or I/O errors. +*/ +static inline ssize_t HTS_RESULT_USED +hread(hFILE *fp, void *buffer, size_t nbytes) +{ + extern ssize_t hread2(hFILE *, void *, size_t, size_t); + + size_t n = fp->end - fp->begin; + if (n > nbytes) n = nbytes; + memcpy(buffer, fp->begin, n); + fp->begin += n; + return (n == nbytes)? (ssize_t) n : hread2(fp, buffer, nbytes, n); +} + +/*! 
+ @abstract Write a character to the stream + @return The character written, or EOF if an error occurred. +*/ +static inline int hputc(int c, hFILE *fp) +{ + extern int hputc2(int, hFILE *); + if (fp->begin < fp->limit) *(fp->begin++) = c; + else c = hputc2(c, fp); + return c; +} + +/*! + @abstract Write a string to the stream + @return 0 if successful, or EOF if an error occurred. +*/ +static inline int hputs(const char *text, hFILE *fp) +{ + extern int hputs2(const char *, size_t, size_t, hFILE *); + + size_t nbytes = strlen(text), n = fp->limit - fp->begin; + if (n > nbytes) n = nbytes; + memcpy(fp->begin, text, n); + fp->begin += n; + return (n == nbytes)? 0 : hputs2(text, nbytes, n, fp); +} + +/*! + @abstract Write a block of characters to the file + @return Either nbytes, or negative if an error occurred. + @notes In the absence of I/O errors, the full nbytes will be written. +*/ +static inline ssize_t HTS_RESULT_USED +hwrite(hFILE *fp, const void *buffer, size_t nbytes) +{ + extern ssize_t hwrite2(hFILE *, const void *, size_t, size_t); + + size_t n = fp->limit - fp->begin; + if (n > nbytes) n = nbytes; + memcpy(fp->begin, buffer, n); + fp->begin += n; + return (n==nbytes)? (ssize_t) n : hwrite2(fp, buffer, nbytes, n); +} + +/*! + @abstract For writing streams, flush buffered output to the underlying stream + @return 0 if successful, or EOF if an error occurred. 
+*/ +int hflush(hFILE *fp) HTS_RESULT_USED; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/hts.h b/star-sys/STAR/source/htslib/htslib/hts.h new file mode 100644 index 0000000..8c9fcd8 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/hts.h @@ -0,0 +1,305 @@ +#ifndef HTS_H +#define HTS_H + +#include +#include + +#ifndef HTS_BGZF_TYPEDEF +typedef struct BGZF BGZF; +#define HTS_BGZF_TYPEDEF +#endif +struct cram_fd; +struct hFILE; + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +/** + * hts_expand() - expands memory block pointed to by $ptr; + * hts_expand0() the latter sets the newly allocated part to 0. + * + * @param n requested number of elements of type type_t + * @param m size of memory allocated + */ +#define hts_expand(type_t, n, m, ptr) if ((n) > (m)) { \ + (m) = (n); kroundup32(m); \ + (ptr) = (type_t*)realloc((ptr), (m) * sizeof(type_t)); \ + } +#define hts_expand0(type_t, n, m, ptr) if ((n) > (m)) { \ + int t = (m); (m) = (n); kroundup32(m); \ + (ptr) = (type_t*)realloc((ptr), (m) * sizeof(type_t)); \ + memset(((type_t*)ptr)+t,0,sizeof(type_t)*((m)-t)); \ + } + +/************ + * File I/O * + ************/ + +typedef struct { + uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, is_compressed:2, is_kstream:1, dummy:25; + int64_t lineno; + kstring_t line; + char *fn, *fn_aux; + union { + BGZF *bgzf; + struct cram_fd *cram; + struct hFILE *hfile; + void *voidp; + } fp; +} htsFile; + +/********************** + * Exported functions * + **********************/ + +extern int hts_verbose; + +/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ +extern const unsigned char seq_nt16_table[256]; + +/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. 
*/ +extern const char seq_nt16_str[]; + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + @abstract Get the htslib version number + @return For released versions, a string like "N.N[.N]"; or git describe + output if using a library built within a Git repository. +*/ +const char *hts_version(void); + +/*! + @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file + @param fn The file name or "-" for stdin/stdout + @param mode Mode matching /[rwa][bcuz0-9]+/ + @discussion + With 'r' opens for reading; any further format mode letters are ignored + as the format is detected by checking the first few bytes or BGZF blocks + of the file. With 'w' or 'a' opens for writing or appending, with format + specifier letters: + b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) + c CRAM format + u uncompressed + z compressed + [0-9] zlib compression level + Note that there is a distinction between 'u' and '0': the first yields + plain uncompressed output whereas the latter outputs uncompressed data + wrapped in the zlib format. + @example + [rw]b .. compressed BCF, BAM, FAI + [rw]u .. uncompressed BCF + [rw]z .. compressed VCF + [rw] .. uncompressed VCF +*/ +htsFile *hts_open(const char *fn, const char *mode); + +/*! + @abstract Close a file handle, flushing buffered data for output streams + @param fp The file handle to be closed + @return 0 for success, or negative if an error occurred. +*/ +int hts_close(htsFile *fp); + +int hts_getline(htsFile *fp, int delimiter, kstring_t *str); +char **hts_readlines(const char *fn, int *_n); +/*! + @abstract Parse comma-separated list or read list from a file + @param fn File name or comma-separated list + @param is_file Presumably non-zero when fn names a file and zero when fn is itself the list (TODO confirm against hts.c) + @param _n Size of the output array (number of items read) + @return NULL on failure or pointer to newly allocated array of + strings +*/ +char **hts_readlist(const char *fn, int is_file, int *_n); + +/*! 
+ @abstract Create extra threads to aid compress/decompression for this file + @param fp The file handle + @param n The number of worker threads to create + @return 0 for success, or negative if an error occurred. + @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE. +*/ +int hts_set_threads(htsFile *fp, int n); + +/*! + @abstract Set .fai filename for a file opened for reading + @return 0 for success, negative on failure + @discussion + Called before *_hdr_read(), this provides the name of a .fai file + used to provide a reference list if the htsFile contains no @SQ headers. +*/ +int hts_set_fai_filename(htsFile *fp, const char *fn_aux); + +#ifdef __cplusplus +} +#endif + +/************ + * Indexing * + ************/ + +/*! +These HTS_IDX_* macros are used as special tid values for hts_itr_query()/etc, +producing iterators operating as follows: + - HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file + - HTS_IDX_START iterates over the entire file + - HTS_IDX_REST iterates from the current position to the end of the file + - HTS_IDX_NONE always returns "no more alignment records" +When one of these special tid values is used, beg and end are ignored. +When REST or NONE is used, idx is also ignored and may be NULL. 
+*/ +#define HTS_IDX_NOCOOR (-2) +#define HTS_IDX_START (-3) +#define HTS_IDX_REST (-4) +#define HTS_IDX_NONE (-5) + +#define HTS_FMT_CSI 0 +#define HTS_FMT_BAI 1 +#define HTS_FMT_TBI 2 +#define HTS_FMT_CRAI 3 + +struct __hts_idx_t; +typedef struct __hts_idx_t hts_idx_t; + +typedef struct { + uint64_t u, v; +} hts_pair64_t; + +typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end); + +typedef struct { + uint32_t read_rest:1, finished:1, dummy:29; + int tid, beg, end, n_off, i; + uint64_t curr_off; + hts_pair64_t *off; + hts_readrec_func *readrec; + struct { + int n, m; + int *a; + } bins; +} hts_itr_t; + +#ifdef __cplusplus +extern "C" { +#endif + + #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7) + #define hts_bin_parent(l) (((l) - 1) >> 3) + + hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls); + void hts_idx_destroy(hts_idx_t *idx); + int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped); + void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset); + + void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt); + hts_idx_t *hts_idx_load(const char *fn, int fmt); + + uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta); + void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy); + + int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped); + uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx); + + const char *hts_parse_reg(const char *s, int *beg, int *end); + hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); + void hts_itr_destroy(hts_itr_t *iter); + + typedef int (*hts_name2id_f)(void*, const char*); + typedef const char *(*hts_id2name_f)(void*, int); + typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); + + hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char 
*reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec); + int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data); + const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values + + /** + * hts_file_type() - Convenience function to determine file type + * @fname: the file name + * + * Returns one of the FT_* defines. + * + * This function was added in order to avoid the need for excessive command + * line switches. + */ + #define FT_UNKN 0 + #define FT_GZ 1 + #define FT_VCF 2 + #define FT_VCF_GZ (FT_GZ|FT_VCF) + #define FT_BCF (1<<2) + #define FT_BCF_GZ (FT_GZ|FT_BCF) + #define FT_STDIN (1<<3) + int hts_file_type(const char *fname); + + +#ifdef __cplusplus +} +#endif + +static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) +{ + int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7; + for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l)) + if (beg>>s == end>>s) return t + (beg>>s); + return 0; +} + +static inline int hts_bin_bot(int bin, int n_lvls) +{ + int l, b; + for (l = 0, b = bin; b; ++l, b = hts_bin_parent(b)); // compute the level of bin + return (bin - hts_bin_first(l)) << (n_lvls - l) * 3; +} + +/************** + * Endianness * + **************/ + +static inline int ed_is_big(void) +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t ed_swap_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *ed_swap_2p(void *x) +{ + *(uint16_t*)x = ed_swap_2(*(uint16_t*)x); + return x; +} +static inline uint32_t ed_swap_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *ed_swap_4p(void *x) +{ + *(uint32_t*)x = ed_swap_4(*(uint32_t*)x); + return x; +} +static inline uint64_t ed_swap_8(uint64_t v) +{ + v = ((v & 
0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static inline void *ed_swap_8p(void *x) +{ + *(uint64_t*)x = ed_swap_8(*(uint64_t*)x); + return x; +} + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/hts_defs.h b/star-sys/STAR/source/htslib/htslib/hts_defs.h new file mode 100644 index 0000000..efc4f6c --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/hts_defs.h @@ -0,0 +1,47 @@ +/* hts_defs.h -- Miscellaneous definitions. + + Copyright (C) 2013-2014 Genome Research Ltd. + + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#ifndef HTSLIB_HTS_DEFS_H +#define HTSLIB_HTS_DEFS_H + +#if (defined __clang__ && __clang_major__ >= 2) || (defined __GNUC__ && __GNUC__ >= 3) +#define HTS_NORETURN __attribute__ ((__noreturn__)) +#else +#define HTS_NORETURN +#endif + +#if (defined __clang__ && __clang_major__ >= 3) || \ + (defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__==4 && __GNUC_MINOR__ >= 5))) +#define HTS_RESULT_USED __attribute__ ((__warn_unused_result__)) +#else +#define HTS_RESULT_USED +#endif + +#if defined __clang__ || defined __GNUC__ +#define HTS_UNUSED __attribute__ ((__unused__)) +#else +#define HTS_UNUSED +#endif + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/kfunc.h b/star-sys/STAR/source/htslib/htslib/kfunc.h new file mode 100644 index 0000000..0a6ad92 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/kfunc.h @@ -0,0 +1,49 @@ +#ifndef __KFUNC_H__ +#define __KFUNC_H__ + +/* Log gamma function + * \log{\Gamma(z)} + * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 + */ +double kf_lgamma(double z); + +/* complementary error function + * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt + * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66 + */ +double kf_erfc(double x); + +/* The following computes regularized incomplete gamma functions. + * Formulas are taken from Wiki, with additional input from Numerical + * Recipes in C (for modified Lentz's algorithm) and AS245 + * (http://lib.stat.cmu.edu/apstat/245). + * + * A good online calculator is available at: + * + * http://www.danielsoper.com/statcalc/calc23.aspx + * + * It calculates upper incomplete gamma function, which equals + * kf_gammaq(s,z)*tgamma(s). + */ + +double kf_gammap(double s, double z); +double kf_gammaq(double s, double z); + +/* Regularized incomplete beta function. The method is taken from + * Numerical Recipes in C, 2nd edition, section 6.4. 
The following web + * page calculates the incomplete beta function, which equals + * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b): + * + * http://www.danielsoper.com/statcalc/calc36.aspx + */ +double kf_betai(double a, double b, double x); + +/* + * n11 n12 | n1_ + * n21 n22 | n2_ + * -----------+---- + * n_1 n_2 | n + */ +double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/khash.h b/star-sys/STAR/source/htslib/htslib/khash.h new file mode 100644 index 0000000..2d910de --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/khash.h @@ -0,0 +1,617 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2013-05-02 (0.2.8): + + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. 
+ + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + */ + +#define AC_VERSION_KHASH_H "0.2.8" + +#include +#include +#include + +/* compiler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +#define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || 
!__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(new_flags); return -1; } /* realloc failed: release new_flags instead of leaking it */ \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(new_flags); return -1; } /* realloc failed: release new_flags instead of leaking it */ \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { khkey_t tmp = 
h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + (++step)) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if 
(__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! 
@function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key)) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! 
@function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: -1 if the operation failed; + 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! 
@function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table (buckets for which kh_exist() is false are skipped) + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table (buckets for which kh_exist() is false are skipped) + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More convenient interfaces */ + +/*! 
@function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash set containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash set containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! 
@function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/star-sys/STAR/source/htslib/htslib/khash_str2int.h b/star-sys/STAR/source/htslib/htslib/khash_str2int.h new file mode 100644 index 0000000..db26d72 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/khash_str2int.h @@ -0,0 +1,98 @@ +#ifndef __KHASH_UTILS_H__ +#define __KHASH_UTILS_H__ + +#include + +KHASH_MAP_INIT_STR(str2int, int) + +/* + * Wrappers for khash dictionaries used by mpileup. + */ + +static inline void *khash_str2int_init(void) +{ + return kh_init(str2int); +} + +/* + * Destroy the hash structure, but not the keys + */ +static inline void khash_str2int_destroy(void *_hash) +{ + khash_t(str2int) *hash = (khash_t(str2int)*)_hash; + if (hash) kh_destroy(str2int, hash); // Note that strings are not freed. +} + +/* + * Destroys both the hash structure and the keys + */ +static inline void khash_str2int_destroy_free(void *_hash) +{ + khash_t(str2int) *hash = (khash_t(str2int)*)_hash; + khint_t k; + if (hash == 0) return; + for (k = 0; k < kh_end(hash); ++k) + if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); + kh_destroy(str2int, hash); +} + +/* + * Returns 1 if key exists or 0 if not (also 0 when the hash itself is NULL) + */ +static inline int khash_str2int_has_key(void *_hash, const char *str) +{ + khash_t(str2int) *hash = (khash_t(str2int)*)_hash; + if ( !hash ) return 0; + if ( kh_get(str2int, hash, str) == kh_end(hash) ) return 0; + return 1; +} + +/* + * Returns 0 on success and -1 when the key is not present. On success, + * *value is set, unless NULL is passed. 
+ */ +static inline int khash_str2int_get(void *_hash, const char *str, int *value) +{ + khash_t(str2int) *hash = (khash_t(str2int)*)_hash; + khint_t k; + if ( !hash ) return -1; + k = kh_get(str2int, hash, str); + if ( k == kh_end(hash) ) return -1; + if ( !value ) return 0; + *value = kh_val(hash, k); + return 0; +} + +/* + * Add a new string to the dictionary, auto-incrementing the value. + * On success returns the newly inserted integer id, on error -1 + * is returned. + */ +static inline int khash_str2int_inc(void *_hash, const char *str) +{ + khint_t k; + int ret; + khash_t(str2int) *hash = (khash_t(str2int)*)_hash; + if ( !hash ) return -1; + k = kh_put(str2int, hash, str, &ret); + if (ret == 0) return kh_val(hash, k); + kh_val(hash, k) = kh_size(hash) - 1; + return kh_val(hash, k); +} + +/* + * Set a new key,value pair. On success returns the bin index, on + * error -1 is returned. + */ +static inline int khash_str2int_set(void *_hash, const char *str, int value) +{ + khint_t k; + int ret; + khash_t(str2int) *hash = (khash_t(str2int)*)_hash; + if ( !hash ) return -1; + k = kh_put(str2int, hash, str, &ret); + kh_val(hash,k) = value; + return k; +} + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/klist.h b/star-sys/STAR/source/htslib/htslib/klist.h new file mode 100644 index 0000000..8b33f27 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/klist.h @@ -0,0 +1,121 @@ +/* The MIT License + + Copyright (c) 2008-2009, by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + 
included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef _AC_KLIST_H +#define _AC_KLIST_H + +#include + +#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \ + typedef struct { \ + size_t cnt, n, max; \ + kmptype_t **buf; \ + } kmp_##name##_t; \ + static inline kmp_##name##_t *kmp_init_##name(void) { \ + return calloc(1, sizeof(kmp_##name##_t)); \ + } \ + static inline void kmp_destroy_##name(kmp_##name##_t *mp) { \ + size_t k; \ + for (k = 0; k < mp->n; ++k) { \ + kmpfree_f(mp->buf[k]); free(mp->buf[k]); \ + } \ + free(mp->buf); free(mp); \ + } \ + static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ + ++mp->cnt; \ + if (mp->n == 0) return calloc(1, sizeof(kmptype_t)); \ + return mp->buf[--mp->n]; \ + } \ + static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ + --mp->cnt; \ + if (mp->n == mp->max) { \ + mp->max = mp->max? 
mp->max<<1 : 16; \ + mp->buf = realloc(mp->buf, sizeof(kmptype_t *) * mp->max); \ + } \ + mp->buf[mp->n++] = p; \ + } + +#define kmempool_t(name) kmp_##name##_t +#define kmp_init(name) kmp_init_##name() +#define kmp_destroy(name, mp) kmp_destroy_##name(mp) +#define kmp_alloc(name, mp) kmp_alloc_##name(mp) +#define kmp_free(name, mp, p) kmp_free_##name(mp, p) + +#define KLIST_INIT(name, kltype_t, kmpfree_t) \ + struct __kl1_##name { \ + kltype_t data; \ + struct __kl1_##name *next; \ + }; \ + typedef struct __kl1_##name kl1_##name; \ + KMEMPOOL_INIT(name, kl1_##name, kmpfree_t) \ + typedef struct { \ + kl1_##name *head, *tail; \ + kmp_##name##_t *mp; \ + size_t size; \ + } kl_##name##_t; \ + static inline kl_##name##_t *kl_init_##name(void) { \ + kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t)); \ + kl->mp = kmp_init(name); \ + kl->head = kl->tail = kmp_alloc(name, kl->mp); \ + kl->head->next = 0; \ + return kl; \ + } \ + static inline void kl_destroy_##name(kl_##name##_t *kl) { \ + kl1_##name *p; \ + for (p = kl->head; p != kl->tail; p = p->next) \ + kmp_free(name, kl->mp, p); \ + kmp_free(name, kl->mp, p); \ + kmp_destroy(name, kl->mp); \ + free(kl); \ + } \ + static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \ + kl1_##name *q, *p = kmp_alloc(name, kl->mp); \ + q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \ + ++kl->size; \ + return &q->data; \ + } \ + static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \ + kl1_##name *p; \ + if (kl->head->next == 0) return -1; \ + --kl->size; \ + p = kl->head; kl->head = kl->head->next; \ + if (d) *d = p->data; \ + kmp_free(name, kl->mp, p); \ + return 0; \ + } + +#define kliter_t(name) kl1_##name +#define klist_t(name) kl_##name##_t +#define kl_val(iter) ((iter)->data) +#define kl_next(iter) ((iter)->next) +#define kl_begin(kl) ((kl)->head) +#define kl_end(kl) ((kl)->tail) + +#define kl_init(name) kl_init_##name() +#define kl_destroy(name, kl) kl_destroy_##name(kl) +#define 
kl_pushp(name, kl) kl_pushp_##name(kl) +#define kl_shift(name, kl, d) kl_shift_##name(kl, d) + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/knetfile.h b/star-sys/STAR/source/htslib/htslib/knetfile.h new file mode 100644 index 0000000..c980258 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/knetfile.h @@ -0,0 +1,75 @@ +#ifndef KNETFILE_H +#define KNETFILE_H + +#include +#include + +#ifndef _WIN32 +#define netread(fd, ptr, len) read(fd, ptr, len) +#define netwrite(fd, ptr, len) write(fd, ptr, len) +#define netclose(fd) close(fd) +#else +#include +#define netread(fd, ptr, len) recv(fd, ptr, len, 0) +#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) +#define netclose(fd) closesocket(fd) +#endif + +// FIXME: currently I/O is unbuffered + +#define KNF_TYPE_LOCAL 1 +#define KNF_TYPE_FTP 2 +#define KNF_TYPE_HTTP 3 + +typedef struct knetFile_s { + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; +} knetFile; + +#define knet_tell(fp) ((fp)->offset) +#define knet_fileno(fp) ((fp)->fd) + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 + int knet_win32_init(); + void knet_win32_destroy(); +#endif + + knetFile *knet_open(const char *fn, const char *mode); + + /* + This only works with local files. + */ + knetFile *knet_dopen(int fd, const char *mode); + + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ + ssize_t knet_read(knetFile *fp, void *buf, size_t len); + + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. 
+ */ + off_t knet_seek(knetFile *fp, off_t off, int whence); + int knet_close(knetFile *fp); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/kseq.h b/star-sys/STAR/source/htslib/htslib/kseq.h new file mode 100644 index 0000000..577cdc4 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/kseq.h @@ -0,0 +1,253 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Last Modified: 05MAR2012 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' 
' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + int begin, end; \ + int is_eof:2, bufsize:30; \ + uint64_t seek_pos; \ + type_t f; \ + unsigned char *buf; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(SCOPE, type_t, __bufsize) \ + SCOPE kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; ks->bufsize = __bufsize; \ + ks->buf = (unsigned char*)malloc(__bufsize); \ + return ks; \ + } \ + SCOPE void ks_destroy(kstream_t *ks) \ + { \ + if (!ks) return; \ + free(ks->buf); \ + free(ks); \ + } + +#define __KS_INLINED(__read) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, ks->bufsize); \ + if (ks->end < ks->bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + ks->seek_pos++; \ + return (int)ks->buf[ks->begin++]; \ + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(SCOPE, __read) \ + SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + { \ + if (dret) *dret = 0; \ + str->l = append? 
str->l : 0; \ + uint64_t seek_pos = str->l; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, ks->bufsize); \ + if (ks->end < ks->bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + seek_pos += i - ks->begin; if ( i < ks->end ) seek_pos++; \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + ks->seek_pos += seek_pos; \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(SCOPE, type_t, __bufsize) \ + __KS_GETUNTIL(SCOPE, __read) \ + __KS_INLINED(__read) + +#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) + +#define KSTREAM_DECLARE(type_t, __read) \ + __KS_TYPE(type_t) \ + extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t 
*str, int *dret, int append); \ + extern kstream_t *ks_init(type_t f); \ + extern void ks_destroy(kstream_t *ks); \ + __KS_INLINED(__read) + +/****************** + * FASTA/Q parser * + ******************/ + +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + SCOPE void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + if (seq->seq.l + 
1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/ksort.h b/star-sys/STAR/source/htslib/htslib/ksort.h new file mode 100644 index 0000000..aa0bb93 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/ksort.h @@ -0,0 +1,285 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + 2012-12-11 (0.1.4): + + * Defined __ks_insertsort_##name as static to compile with C99. + + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. 
+ + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapadjust_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ + } \ + } \ + static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = 
swap_tmp; } \ + return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } \ + void ks_shuffle_##name(size_t n, type_t a[]) \ + { \ + int i, j; \ + for (i = n; i > 1; --i) { \ + 
type_t tmp; \ + j = (int)(drand48() * i); \ + tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) +#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/kstdint.h b/star-sys/STAR/source/htslib/htslib/kstdint.h new file mode 100644 index 0000000..e948850 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/kstdint.h @@ -0,0 +1,64 @@ +#ifndef KSTDINT_H +#define KSTDINT_H + +#include + +/* Basic assumptions: 1) "char" is 8-bit; 2) there is an 8-bit, 16-bit, 32-bit + * and 64-bit integer type, respectively; 3) "short" is no less than "char", + * "int" is no less than "short", "long" is no less than "int" and "long long" + * is no less than "long"; 4) "int" is at least 16-bit, "long" at least 32-bit + * and "long long" at least 64-bit. The last two assumptions are enforced by + * the C99 spec. + * + * Following assumptions 1) and 2), we know that "signed char"=="int8_t" and + * "short"=="int16_t" for sure. Further from the assumptions, a 32-bit integer + * type must be either "int" or "long". We can test (UINT16_MAX==UINT_MAX) to + * see which is the case. Similarly, a 64-bit integer must be either "long" or + * "long long". We can test (UINT32_MAX==ULONG_MAX) to get the definite answer. 
+ */ + +/* 8-bit integers */ +typedef signed char int8_t; +typedef unsigned char uint8_t; +#define INT8_MIN (-SCHAR_MAX-1) +#define INT8_MAX SCHAR_MAX +#define UINT8_MAX UCHAR_MAX + +/* 16-bit integers */ +typedef signed short int16_t; +typedef unsigned short uint16_t; +#define INT16_MIN (-SHRT_MAX-1) +#define INT16_MAX SHRT_MAX +#define UINT16_MAX USHRT_MAX + +/* 32-bit integers */ +#if UINT16_MAX != UINT_MAX +typedef signed int int32_t; +typedef unsigned int uint32_t; +#define INT32_MIN (-INT_MAX-1) +#define INT32_MAX INT_MAX +#define UINT32_MAX UINT_MAX +#else /* then int is 16-bit and long is 32-bit, which may happen to compilers for embedded CPUs */ +typedef signed long int32_t; +typedef unsigned long uint32_t; +#define INT32_MIN (-LONG_MAX-1) +#define INT32_MAX LONG_MAX +#define UINT32_MAX ULONG_MAX +#endif /* ~UINT16_MAX!=UINT_MAX */ + +/* 64-bit integers */ +#if UINT32_MAX != ULONG_MAX +typedef signed long int64_t; +typedef unsigned long uint64_t; +#define INT64_MIN (-LONG_MAX-1) +#define INT64_MAX LONG_MAX +#define UINT64_MAX ULONG_MAX +#else +typedef signed long long int64_t; +typedef unsigned long long uint64_t; +#define INT64_MIN (-LLONG_MAX-1) +#define INT64_MAX LLONG_MAX +#define UINT64_MAX ULLONG_MAX +#endif /* ~UINT32_MAX!=ULONG_MAX */ + +#endif /* ~defined(KSTDINT_H) */ diff --git a/star-sys/STAR/source/htslib/htslib/kstring.h b/star-sys/STAR/source/htslib/htslib/kstring.h new file mode 100644 index 0000000..2567efc --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/kstring.h @@ -0,0 +1,270 @@ +/* The MIT License + + Copyright (c) by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished 
to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include +#include +#include +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4) +#define KS_ATTR_PRINTF(fmt, arg) __attribute__((__format__ (__printf__, fmt, arg))) +#else +#define KS_ATTR_PRINTF(fmt, arg) +#endif + + +/* kstring_t is a simple non-opaque type whose fields are likely to be + * used directly by user code (but see also ks_str() and ks_len() below). + * A kstring_t object is initialised by either of + * kstring_t str = { 0, 0, NULL }; + * kstring_t str; ...; str.l = str.m = 0; str.s = NULL; + * and either ownership of the underlying buffer should be given away before + * the object disappears (see ks_release() below) or the kstring_t should be + * destroyed with free(str.s); */ +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +typedef struct { + uint64_t tab[4]; + int sep, finished; + const char *p; // end of the current token +} ks_tokaux_t; + +#ifdef __cplusplus +extern "C" { +#endif + + int kvsprintf(kstring_t *s, const char *fmt, va_list ap) KS_ATTR_PRINTF(2,0); + int ksprintf(kstring_t *s, const char *fmt, ...) 
KS_ATTR_PRINTF(2,3); + int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + char *kstrstr(const char *str, const char *pat, int **_prep); + char *kstrnstr(const char *str, const char *pat, int n, int **_prep); + void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); + + /* kstrtok() is similar to strtok_r() except that str is not + * modified and both str and sep can be NULL. For efficiency, it is + * actually recommended to set both to NULL in the subsequent calls + * if sep is not changed. */ + char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); + +#ifdef __cplusplus +} +#endif + +static inline int ks_resize(kstring_t *s, size_t size) +{ + if (s->m < size) { + char *tmp; + s->m = size; + kroundup32(s->m); + if ((tmp = (char*)realloc(s->s, s->m))) + s->s = tmp; + else + return -1; + } + return 0; +} + +static inline char *ks_str(kstring_t *s) +{ + return s->s; +} + +static inline size_t ks_len(kstring_t *s) +{ + return s->l; +} + +// Give ownership of the underlying buffer away to something else (making +// that something else responsible for freeing it), leaving the kstring_t +// empty and ready to be used again, or ready to go out of scope without +// needing free(str.s) to prevent a memory leak. 
+static inline char *ks_release(kstring_t *s) +{ + char *ss = s->s; + s->l = s->m = 0; + s->s = NULL; + return ss; +} + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + char *tmp; + s->m = s->l + l + 2; + kroundup32(s->m); + if ((tmp = (char*)realloc(s->s, s->m))) + s->s = tmp; + else + return EOF; + } + memcpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + char *tmp; + s->m = s->l + 2; + kroundup32(s->m); + if ((tmp = (char*)realloc(s->s, s->m))) + s->s = tmp; + else + return EOF; + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int kputc_(int c, kstring_t *s) +{ + if (s->l + 1 > s->m) { + char *tmp; + s->m = s->l + 1; + kroundup32(s->m); + if ((tmp = (char*)realloc(s->s, s->m))) + s->s = tmp; + else + return EOF; + } + s->s[s->l++] = c; + return 1; +} + +static inline int kputsn_(const void *p, int l, kstring_t *s) +{ + if (s->l + l > s->m) { + char *tmp; + s->m = s->l + l; + kroundup32(s->m); + if ((tmp = (char*)realloc(s->s, s->m))) + s->s = tmp; + else + return EOF; + } + memcpy(s->s + s->l, p, l); + s->l += l; + return l; +} + +static inline int kputw(int c, kstring_t *s) +{ + char buf[16]; + int i, l = 0; + unsigned int x = c; + if (c < 0) x = -x; + do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + char *tmp; + s->m = s->l + l + 2; + kroundup32(s->m); + if ((tmp = (char*)realloc(s->s, s->m))) + s->s = tmp; + else + return EOF; + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputuw(unsigned c, kstring_t *s) +{ + char buf[16]; + int l, i; + unsigned x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (s->l + l + 1 >= 
s->m) { + char *tmp; + s->m = s->l + l + 2; + kroundup32(s->m); + if ((tmp = (char*)realloc(s->s, s->m))) + s->s = tmp; + else + return EOF; + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputl(long c, kstring_t *s) +{ + char buf[32]; + int i, l = 0; + unsigned long x = c; + if (c < 0) x = -x; + do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + char *tmp; + s->m = s->l + l + 2; + kroundup32(s->m); + if ((tmp = (char*)realloc(s->s, s->m))) + s->s = tmp; + else + return EOF; + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + +/* + * Returns 's' split by delimiter, with *n being the number of components; + * NULL on failue. + */ +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/sam.h b/star-sys/STAR/source/htslib/htslib/sam.h new file mode 100644 index 0000000..e046d01 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/sam.h @@ -0,0 +1,401 @@ +#ifndef BAM_H +#define BAM_H + +#include +#include "hts.h" + +/********************** + *** SAM/BAM header *** + **********************/ + +/*! @typedef + @abstract Structure for the alignment header. 
+ @field n_targets number of reference sequences + @field l_text length of the plain text in the header + @field target_len lengths of the reference sequences + @field target_name names of the reference sequences + @field text plain text + @field sdict header dictionary + */ + +typedef struct { + int32_t n_targets, ignore_sam_err; + uint32_t l_text; + uint32_t *target_len; + int8_t *cigar_tab; + char **target_name; + char *text; + void *sdict; +} bam_hdr_t; + +/**************************** + *** CIGAR related macros *** + ****************************/ + +#define BAM_CMATCH 0 +#define BAM_CINS 1 +#define BAM_CDEL 2 +#define BAM_CREF_SKIP 3 +#define BAM_CSOFT_CLIP 4 +#define BAM_CHARD_CLIP 5 +#define BAM_CPAD 6 +#define BAM_CEQUAL 7 +#define BAM_CDIFF 8 +#define BAM_CBACK 9 + +#define BAM_CIGAR_STR "MIDNSHP=XB" +#define BAM_CIGAR_SHIFT 4 +#define BAM_CIGAR_MASK 0xf +#define BAM_CIGAR_TYPE 0x3C1A7 + +#define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) +#define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT) +#define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)]) +#define bam_cigar_gen(l, o) ((l)<>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference + +/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ +#define BAM_FPAIRED 1 +/*! @abstract the read is mapped in a proper pair */ +#define BAM_FPROPER_PAIR 2 +/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */ +#define BAM_FUNMAP 4 +/*! @abstract the mate is unmapped */ +#define BAM_FMUNMAP 8 +/*! @abstract the read is mapped to the reverse strand */ +#define BAM_FREVERSE 16 +/*! @abstract the mate is mapped to the reverse strand */ +#define BAM_FMREVERSE 32 +/*! @abstract this is read1 */ +#define BAM_FREAD1 64 +/*! @abstract this is read2 */ +#define BAM_FREAD2 128 +/*! @abstract not primary alignment */ +#define BAM_FSECONDARY 256 +/*! @abstract QC failure */ +#define BAM_FQCFAIL 512 +/*! 
@abstract optical or PCR duplicate */ +#define BAM_FDUP 1024 +/*! @abstract supplementary alignment */ +#define BAM_FSUPPLEMENTARY 2048 + +/************************* + *** Alignment records *** + *************************/ + +/*! @typedef + @abstract Structure for core alignment information. + @field tid chromosome ID, defined by bam_hdr_t + @field pos 0-based leftmost coordinate + @field bin bin calculated by bam_reg2bin() + @field qual mapping quality + @field l_qname length of the query name + @field flag bitwise flag + @field n_cigar number of CIGAR operations + @field l_qseq length of the query sequence (read) + @field mtid chromosome ID of next read in template, defined by bam_hdr_t + @field mpos 0-based leftmost coordinate of next read in template + */ +typedef struct { + int32_t tid; + int32_t pos; + uint32_t bin:16, qual:8, l_qname:8; + uint32_t flag:16, n_cigar:16; + int32_t l_qseq; + int32_t mtid; + int32_t mpos; + int32_t isize; +} bam1_core_t; + +/*! @typedef + @abstract Structure for one alignment. + @field core core information about the alignment + @field l_data current length of bam1_t::data + @field m_data maximum length of bam1_t::data + @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux + + @discussion Notes: + + 1. qname is zero tailing and core.l_qname includes the tailing '\0'. + 2. l_qseq is calculated from the total length of an alignment block + on reading or from CIGAR. + 3. cigar data is encoded 4 bytes per CIGAR operation. + 4. seq is nybble-encoded according to bam_nt16_table. + */ +typedef struct { + bam1_core_t core; + int l_data, m_data; + uint8_t *data; +#ifndef BAM_NO_ID + uint64_t id; +#endif +} bam1_t; + +/*! @function + @abstract Get whether the query is on the reverse strand + @param b pointer to an alignment + @return boolean true if query is on the reverse strand + */ +#define bam_is_rev(b) (((b)->core.flag&BAM_FREVERSE) != 0) +/*! 
@function + @abstract Get whether the query's mate is on the reverse strand + @param b pointer to an alignment + @return boolean true if query's mate on the reverse strand + */ +#define bam_is_mrev(b) (((b)->core.flag&BAM_FMREVERSE) != 0) +/*! @function + @abstract Get the name of the query + @param b pointer to an alignment + @return pointer to the name string, null terminated + */ +#define bam_get_qname(b) ((char*)(b)->data) +/*! @function + @abstract Get the CIGAR array + @param b pointer to an alignment + @return pointer to the CIGAR array + + @discussion In the CIGAR array, each element is a 32-bit integer. The + lower 4 bits gives a CIGAR operation and the higher 28 bits keep the + length of a CIGAR. + */ +#define bam_get_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) +/*! @function + @abstract Get query sequence + @param b pointer to an alignment + @return pointer to sequence + + @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, + 8 for T and 15 for N. Two bases are packed in one byte with the base + at the higher 4 bits having smaller coordinate on the read. It is + recommended to use bam_seqi() macro to get the base. + */ +#define bam_get_seq(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname) +/*! @function + @abstract Get query quality + @param b pointer to an alignment + @return pointer to quality string + */ +#define bam_get_qual(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) +/*! @function + @abstract Get auxiliary data + @param b pointer to an alignment + @return pointer to the concatenated auxiliary data + */ +#define bam_get_aux(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1) + (b)->core.l_qseq) +/*! 
@function + @abstract Get length of auxiliary data + @param b pointer to an alignment + @return length of the concatenated auxiliary data + */ +#define bam_get_l_aux(b) ((b)->l_data - ((b)->core.n_cigar<<2) - (b)->core.l_qname - (b)->core.l_qseq - (((b)->core.l_qseq + 1)>>1)) +/*! @function + @abstract Get a base on read + @param s Query sequence returned by bam1_seq() + @param i The i-th position, 0-based + @return 4-bit integer representing the base. + */ +#define bam_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf) + +/************************** + *** Exported functions *** + **************************/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*************** + *** BAM I/O *** + ***************/ + + bam_hdr_t *bam_hdr_init(void); + bam_hdr_t *bam_hdr_read(BGZF *fp); + int bam_hdr_write(BGZF *fp, const bam_hdr_t *h); + void bam_hdr_destroy(bam_hdr_t *h); + int bam_name2id(bam_hdr_t *h, const char *ref); + bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0); + + bam1_t *bam_init1(void); + void bam_destroy1(bam1_t *b); + int bam_read1(BGZF *fp, bam1_t *b); + int bam_write1(BGZF *fp, const bam1_t *b); + bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc); + bam1_t *bam_dup1(const bam1_t *bsrc); + + int bam_cigar2qlen(int n_cigar, const uint32_t *cigar); + int bam_cigar2rlen(int n_cigar, const uint32_t *cigar); + + /*! + @abstract Calculate the rightmost base position of an alignment on the + reference genome. + + @param b pointer to an alignment + @return the coordinate of the first base after the alignment, 0-based + + @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen. + For an unmapped read (either according to its flags or if it has no cigar + string), we return b->core.pos + 1 by convention. 
+ */ + int32_t bam_endpos(const bam1_t *b); + + int bam_str2flag(const char *str); /** returns negative value on error */ + char *bam_flag2str(int flag); /** The string must be freed by the user */ + + /************************* + *** BAM/CRAM indexing *** + *************************/ + + // These BAM iterator functions work only on BAM files. To work with either + // BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. + #define bam_itr_destroy(iter) hts_itr_destroy(iter) + #define bam_itr_queryi(idx, tid, beg, end) sam_itr_queryi(idx, tid, beg, end) + #define bam_itr_querys(idx, hdr, region) sam_itr_querys(idx, hdr, region) + #define bam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0) + + // Load .csi or .bai BAM index file. + #define bam_index_load(fn) hts_idx_load((fn), HTS_FMT_BAI) + + int bam_index_build(const char *fn, int min_shift); + + // Load BAM (.csi or .bai) or CRAM (.crai) index file. + hts_idx_t *sam_index_load(htsFile *fp, const char *fn); + + #define sam_itr_destroy(iter) hts_itr_destroy(iter) + hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end); + hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region); + #define sam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), (htsfp)) + + /*************** + *** SAM I/O *** + ***************/ + + #define sam_open(fn, mode) (hts_open((fn), (mode))) + #define sam_close(fp) hts_close(fp) + + int sam_open_mode(char *mode, const char *fn, const char *format); + + typedef htsFile samFile; + bam_hdr_t *sam_hdr_parse(int l_text, const char *text); + bam_hdr_t *sam_hdr_read(samFile *fp); + int sam_hdr_write(samFile *fp, const bam_hdr_t *h); + + int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b); + int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str); + int sam_read1(samFile *fp, bam_hdr_t *h, bam1_t *b); + int sam_write1(samFile *fp, const bam_hdr_t *h, const bam1_t *b); + + 
/************************************* + *** Manipulating auxiliary fields *** + *************************************/ + + uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); + int32_t bam_aux2i(const uint8_t *s); + double bam_aux2f(const uint8_t *s); + char bam_aux2A(const uint8_t *s); + char *bam_aux2Z(const uint8_t *s); + + void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); + int bam_aux_del(bam1_t *b, uint8_t *s); + +#ifdef __cplusplus +} +#endif + +/************************** + *** Pileup and Mpileup *** + **************************/ + +#if !defined(BAM_NO_PILEUP) + +/*! @typedef + @abstract Structure for one alignment covering the pileup position. + @field b pointer to the alignment + @field qpos position of the read base at the pileup site, 0-based + @field indel indel length; 0 for no indel, positive for ins and negative for del + @field level the level of the read in the "viewer" mode + @field is_del 1 iff the base on the padded read is a deletion + @field is_head ??? + @field is_tail ??? + @field is_refskip ??? + @field aux ??? + + @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The + difference between the two functions is that the former does not + set bam_pileup1_t::level, while the later does. Level helps the + implementation of alignment viewers, but calculating this has some + overhead. + */ +typedef struct { + bam1_t *b; + int32_t qpos; + int indel, level; + uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; +} bam_pileup1_t; + +typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); + +struct __bam_plp_t; +typedef struct __bam_plp_t *bam_plp_t; + +struct __bam_mplp_t; +typedef struct __bam_mplp_t *bam_mplp_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * bam_plp_init() - sets an iterator over multiple + * @func: see mplp_func in bam_plcmd.c in samtools for an example. 
Expected return + * status: 0 on success, -1 on end, < -1 on non-recoverable errors + * @data: user data to pass to @func + */ + bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data); + void bam_plp_destroy(bam_plp_t iter); + int bam_plp_push(bam_plp_t iter, const bam1_t *b); + const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); + void bam_plp_reset(bam_plp_t iter); + + bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); + /** + * bam_mplp_init_overlaps() - if called, mpileup will detect overlapping + * read pairs and for each base pair set the base quality of the + * lower-quality base to zero, thus effectively discarding it from + * calling. If the two bases are identical, the quality of the other base + * is increased to the sum of their qualities (capped at 200), otherwise + * it is multiplied by 0.8. + */ + void bam_mplp_init_overlaps(bam_mplp_t iter); + void bam_mplp_destroy(bam_mplp_t iter); + void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); + int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); + +#ifdef __cplusplus +} +#endif + +#endif // ~!defined(BAM_NO_PILEUP) + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/synced_bcf_reader.h b/star-sys/STAR/source/htslib/htslib/synced_bcf_reader.h new file mode 100644 index 0000000..2f79a21 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/synced_bcf_reader.h @@ -0,0 +1,255 @@ +/* + The synced_bcf_reader allows to keep multiple VCFs open and stream them + using the next_line iterator in a seamless matter without worrying about + chromosomes and synchronizing the sites. This is used by vcfcheck to + compare multiple VCFs simultaneously and is used also for merging, + creating intersections, etc. 
+ + The synced_bcf_reader also provides API for reading indexed BCF/VCF, + hiding differences in BCF/VCF opening, indexing and reading. + + + Example of usage: + + bcf_srs_t *sr = bcf_sr_init(); + for (i=0; ihas_line[i] +#define bcf_sr_get_line(_readers, i) ((_readers)->has_line[i] ? ((_readers)->readers[i].buffer[0]) : NULL) +#define bcf_sr_region_done(_readers,i) (!(_readers)->has_line[i] && !(_readers)->readers[i].nbuffer ? 1 : 0) + +/** + * bcf_sr_seek() - set all readers to selected position + * @seq: sequence name; NULL to seek to start + * @pos: 0-based coordinate + */ +int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos); + +/** + * bcf_sr_set_samples() - sets active samples + * @readers: holder of the open readers + * @samples: this can be one of: file name with one sample per line; + * or column-separated list of samples; or '-' for a list of + * samples shared by all files. If first character is the + * exclamation mark, all but the listed samples are included. + * @is_file: 0: list of samples; 1: file with sample names + * + * Returns 1 if the call succeeded, or 0 on error. + */ +int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); + +/** + * bcf_sr_set_targets(), bcf_sr_set_regions() - init targets/regions + * @readers: holder of the open readers + * @targets: list of regions, one-based and inclusive. + * @is_fname: 0: targets is a comma-separated list of regions (chr,chr:from-to) + * 1: targets is a tabix indexed file with a list of regions + * ( or ) + * + * Returns 0 if the call succeeded, or -1 on error. + * + * Both functions behave the same way, unlisted positions will be skipped by + * bcf_sr_next_line(). However, there is an important difference: regions use + * index to jump to desired positions while targets streams the whole files + * and merely skip unlisted positions. 
+ * + * Moreover, bcf_sr_set_targets() accepts an optional parameter $alleles which + * is interpreted as a 1-based column index in the tab-delimited file where + * alleles are listed. This in principle makes it possible to perform the COLLAPSE_* + * logic also with tab-delimited files. However, the current implementation + * considers the alleles merely as a suggestion for prioritizing one of possibly + * duplicate VCF lines. It is up to the caller to examine targets->als if + * perfect match is sought after. Note that the duplicate positions in targets + * file are currently not supported. + */ +int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles); +int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file); + + + +/* + * bcf_sr_regions_init() + * @regions: regions can be either a comma-separated list of regions + * (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or + * tab-delimited file (the default). Uncompressed files + * are stored in memory while bgzip-compressed and tabix-indexed + * region files are streamed. + * @is_file: 0: regions is a comma-separated list of regions + * (chr|chr:pos|chr:from-to|chr:from-) + * 1: VCF, BED or tab-delimited file + * @chr, from, to: + * Column indexes of chromosome, start position and end position + * in the tab-delimited file. The positions are 1-based and + * inclusive. + * These parameters are ignored when reading from VCF, BED or + * tabix-indexed files. When end position column is not present, + * supply 'from' in place of 'to'. When 'to' is negative, first + * abs(to) will be attempted and if that fails, 'from' will be used + * instead. + */ +bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int chr, int from, int to); +void bcf_sr_regions_destroy(bcf_sr_regions_t *regions); + +/* + * bcf_sr_regions_seek() - seek to the chromosome block + * + * Returns 0 on success or -1 on failure. Sets reg->seq appropriately and + * reg->start,reg->end to -1. 
+ */ +int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr); + +/* + * bcf_sr_regions_next() - retrieves next region. Returns 0 on success and -1 + * when all regions have been read. The fields reg->seq, reg->start and + * reg->end are filled with the genomic coordinates on success or with + * NULL,-1,-1 when no region is available. The coordinates are 0-based, + * inclusive. + */ +int bcf_sr_regions_next(bcf_sr_regions_t *reg); + +/* + * bcf_sr_regions_overlap() - checks if the interval overlaps any of + * the regions, the coordinates are 0-based, inclusive. The coordinate queries + * must come in ascending order. + * + * Returns 0 if the position is in regions; -1 if the position is not in the + * regions and more regions exist; -2 if not in the regions and there are no more + * regions left. + */ +int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end); + +/* + * bcf_sr_regions_flush() - repeatedly calls regs->missed_reg_handler() until + * all remaining records are processed. + */ +void bcf_sr_regions_flush(bcf_sr_regions_t *regs); + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/tbx.h b/star-sys/STAR/source/htslib/htslib/tbx.h new file mode 100644 index 0000000..71b4ac2 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/tbx.h @@ -0,0 +1,52 @@ +#ifndef TBX_H +#define TBX_H + +#include "hts.h" + +#define TBX_MAX_SHIFT 31 + +#define TBX_GENERIC 0 +#define TBX_SAM 1 +#define TBX_VCF 2 +#define TBX_UCSC 0x10000 + +typedef struct { + int32_t preset; + int32_t sc, bc, ec; // seq col., beg col. and end col. 
+ int32_t meta_char, line_skip; +} tbx_conf_t; + +typedef struct { + tbx_conf_t conf; + hts_idx_t *idx; + void *dict; +} tbx_t; + +extern tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sam, tbx_conf_vcf; + +#ifdef __cplusplus +extern "C" { +#endif + + #define tbx_itr_destroy(iter) hts_itr_destroy(iter) + #define tbx_itr_queryi(tbx, tid, beg, end) hts_itr_query((tbx)->idx, (tid), (beg), (end), tbx_readrec) + #define tbx_itr_querys(tbx, s) hts_itr_querys((tbx)->idx, (s), (hts_name2id_f)(tbx_name2id), (tbx), hts_itr_query, tbx_readrec) + #define tbx_itr_next(htsfp, tbx, itr, r) hts_itr_next(hts_get_bgzfp(htsfp), (itr), (r), (tbx)) + #define tbx_bgzf_itr_next(bgzfp, tbx, itr, r) hts_itr_next((bgzfp), (itr), (r), (tbx)) + + int tbx_name2id(tbx_t *tbx, const char *ss); + + /* Internal helper function used by tbx_itr_next() */ + BGZF *hts_get_bgzfp(htsFile *fp); + int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end); + + int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf); + tbx_t *tbx_index_load(const char *fn); + const char **tbx_seqnames(tbx_t *tbx, int *n); // free the array but not the values + void tbx_destroy(tbx_t *tbx); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/vcf.h b/star-sys/STAR/source/htslib/htslib/vcf.h new file mode 100644 index 0000000..ef95ee0 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/vcf.h @@ -0,0 +1,824 @@ +/* + todo: + - make the function names consistent + - provide calls to abstract away structs as much as possible + */ + +#ifndef BCF_H +#define BCF_H + +#include +#include +#include +#include "hts.h" +#include "kstring.h" + + +/***************** + * Header struct * + *****************/ + +#define BCF_HL_FLT 0 // header line +#define BCF_HL_INFO 1 +#define BCF_HL_FMT 2 +#define BCF_HL_CTG 3 +#define BCF_HL_STR 4 // structured header line TAG= +#define BCF_HL_GEN 5 // generic header line + +#define BCF_HT_FLAG 0 // header 
type +#define BCF_HT_INT 1 +#define BCF_HT_REAL 2 +#define BCF_HT_STR 3 + +#define BCF_VL_FIXED 0 // variable length +#define BCF_VL_VAR 1 +#define BCF_VL_A 2 +#define BCF_VL_G 3 +#define BCF_VL_R 4 + +/* === Dictionary === + + The header keeps three dictionaries. The first keeps IDs in the + "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths + in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[] + is the actual hash table, which is opaque to the end users. In the hash + table, the key is the ID or sample name as a C string and the value is a + bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash + table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the + size of the hash table or, equivalently, the length of the id[] arrays. +*/ + +#define BCF_DT_ID 0 // dictionary type +#define BCF_DT_CTG 1 +#define BCF_DT_SAMPLE 2 + +// Complete textual representation of a header line +typedef struct { + int type; // One of the BCF_HL_* types + char *key; // The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc. + char *value; // Set only for generic lines, NULL for FILTER/INFO, etc. 
+ int nkeys; // Number of structured fields + char **keys, **vals; // The key=value pairs +} bcf_hrec_t; + +typedef struct { + uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 for BCF_HL_FLT,INFO,FMT + bcf_hrec_t *hrec[3]; + int id; +} bcf_idinfo_t; + +typedef struct { + const char *key; + const bcf_idinfo_t *val; +} bcf_idpair_t; + +typedef struct { + int32_t n[3]; + bcf_idpair_t *id[3]; + void *dict[3]; // ID dictionary, contig dict and sample dict + char **samples; + bcf_hrec_t **hrec; + int nhrec; + int ntransl, *transl[2]; // for bcf_translate() + int nsamples_ori; // for bcf_hdr_set_samples() + uint8_t *keep_samples; + kstring_t mem; +} bcf_hdr_t; + +extern uint8_t bcf_type_shift[]; + +/************** + * VCF record * + **************/ + +#define BCF_BT_NULL 0 +#define BCF_BT_INT8 1 +#define BCF_BT_INT16 2 +#define BCF_BT_INT32 3 +#define BCF_BT_FLOAT 5 +#define BCF_BT_CHAR 7 + +#define VCF_REF 0 +#define VCF_SNP 1 +#define VCF_MNP 2 +#define VCF_INDEL 4 +#define VCF_OTHER 8 + +typedef struct { + int type, n; // variant type and the number of bases affected, negative for deletions +} variant_t; + +typedef struct { + int id; // id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key + int n, size, type; // n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types + uint8_t *p; // same as vptr and vptr_* in bcf_info_t below + uint32_t p_len; + uint32_t p_off:31, p_free:1; +} bcf_fmt_t; + +typedef struct { + int key; // key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key + int type, len; // type: one of BCF_BT_* types; len: vector length, 1 for scalars + union { + int32_t i; // integer value + float f; // float value + } v1; // only set if $len==1; for easier access + uint8_t *vptr; // pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes + uint32_t vptr_len; // length of the vptr block or, when set, of the vptr_mod block, 
excluding offset + uint32_t vptr_off:31, // vptr offset, i.e., the size of the INFO key plus size+type bytes + vptr_free:1; // indicates that vptr-vptr_off must be freed; set only when modified and the new + // data block is bigger than the original +} bcf_info_t; + + +#define BCF1_DIRTY_ID 1 +#define BCF1_DIRTY_ALS 2 +#define BCF1_DIRTY_FLT 4 +#define BCF1_DIRTY_INF 8 + +typedef struct { + int m_fmt, m_info, m_id, m_als, m_allele, m_flt; // allocated size (high-water mark); do not change + int n_flt; // Number of FILTER fields + int *flt; // FILTER keys in the dictionary + char *id, *als; // ID and REF+ALT block (\0-separated) + char **allele; // allele[0] is the REF (allele[] pointers to the als block); all null terminated + bcf_info_t *info; // INFO + bcf_fmt_t *fmt; // FORMAT and individual sample + variant_t *var; // $var and $var_type set only when set_variant_types called + int n_var, var_type; + int shared_dirty; // if set, shared.s must be recreated on BCF output + int indiv_dirty; // if set, indiv.s must be recreated on BCF output +} bcf_dec_t; + + +#define BCF_ERR_CTG_UNDEF 1 +#define BCF_ERR_TAG_UNDEF 2 +#define BCF_ERR_NCOLS 4 + +/* + The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file + is slower because the string must first be parsed, packed into BCF line + (done in vcf_parse), then unpacked into internal bcf1_t structure. If it + is known in advance that some of the fields will not be required (notably + the sample columns), parsing of these can be skipped by setting max_unpack + appropriately. + Similarly, it is fast to output a BCF line because the columns (kept in + shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF + line must be formatted in vcf_format. 
+ */ +typedef struct { + int32_t rid; // CHROM + int32_t pos; // POS + int32_t rlen; // length of REF + float qual; // QUAL + uint32_t n_info:16, n_allele:16; + uint32_t n_fmt:8, n_sample:24; + kstring_t shared, indiv; + bcf_dec_t d; // lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack() + int max_unpack; // Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed + int unpacked; // remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work + uint8_t *unpack_ptr; // position of the last unpack call + int unpack_size[3]; // the original block size of ID, REF+ALT and FILTER + int errcode; // one of BCF_ERR_* codes +} bcf1_t; + +/******* + * API * + *******/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*********************************************************************** + * BCF and VCF I/O + * + * A note about naming conventions: htslib internally represents VCF + * records as bcf1_t data structures, therefore most functions are + * prefixed with bcf_. There are a few exceptions where the functions must + * be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In + * these cases, functions prefixed with bcf_ are more general and work + * with both BCF and VCF. + * + ***********************************************************************/ + + /** These macros are defined only for consistency with other parts of htslib */ + #define bcf_init1() bcf_init() + #define bcf_read1(fp,h,v) bcf_read((fp),(h),(v)) + #define vcf_read1(fp,h,v) vcf_read((fp),(h),(v)) + #define bcf_write1(fp,h,v) bcf_write((fp),(h),(v)) + #define vcf_write1(fp,h,v) vcf_write((fp),(h),(v)) + #define bcf_destroy1(v) bcf_destroy(v) + #define vcf_parse1(s,h,v) vcf_parse((s),(h),(v)) + #define bcf_clear1(v) bcf_clear(v) + #define vcf_format1(h,v,s) vcf_format((h),(v),(s)) + + /** + * bcf_hdr_init() - create an empty BCF header. 
+ * @param mode "r" or "w" + * + * When opened for writing, the mandatory fileFormat and + * FILTER=PASS lines are added automatically. + */ + bcf_hdr_t *bcf_hdr_init(const char *mode); + + /** Destroy a BCF header struct */ + void bcf_hdr_destroy(bcf_hdr_t *h); + + /** Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t)) */ + bcf1_t *bcf_init(void); + + /** Deallocate a bcf1_t object */ + void bcf_destroy(bcf1_t *v); + + /** + * Same as bcf_destroy() but frees only the memory allocated by bcf1_t, + * not the bcf1_t object itself. + */ + void bcf_empty(bcf1_t *v); + + /** + * Make the bcf1_t object ready for next read. Intended mostly for + * internal use, the user should rarely need to call this function + * directly. + */ + void bcf_clear(bcf1_t *v); + + + /** bcf_open and vcf_open mode: please see hts_open() in hts.h */ + typedef htsFile vcfFile; + #define bcf_open(fn, mode) hts_open((fn), (mode)) + #define vcf_open(fn, mode) hts_open((fn), (mode)) + #define bcf_close(fp) hts_close(fp) + #define vcf_close(fp) hts_close(fp) + + /** Reads VCF or BCF header */ + bcf_hdr_t *bcf_hdr_read(htsFile *fp); + + /** + * bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed + * @samples: samples to include or exclude from file or as a comma-separated string. + * LIST|FILE .. select samples in list/file + * ^LIST|FILE .. exclude samples from list/file + * - .. include all samples + * NULL .. exclude all samples + * @is_file: @samples is a file (1) or a comma-separated list (0) + * + * The bottleneck of VCF reading is parsing of genotype fields. If the + * reader knows in advance that only a subset of samples is needed (possibly + * no samples at all), the performance of bcf_read() can be significantly + * improved by calling bcf_hdr_set_samples after bcf_hdr_read(). + * The function bcf_read() will subset the VCF/BCF records automatically + * with the notable exception when reading records via bcf_itr_next(). 
+ * In this case, bcf_subset_format() must be called explicitly, because + * bcf_readrec() does not see the header. + * + * Returns 0 on success, -1 on error or a positive integer if the list + * contains samples not present in the VCF header. In such a case, the + * return value is the index of the offending sample. + */ + int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file); + int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec); + + + /** Writes VCF or BCF header */ + int bcf_hdr_write(htsFile *fp, const bcf_hdr_t *h); + + /** Parse VCF line contained in kstring and populate the bcf1_t struct */ + int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v); + + /** The opposite of vcf_parse. It should rarely be called directly, see vcf_write */ + int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s); + + /** + * bcf_read() - read next VCF or BCF record + * + * Returns -1 on critical errors, 0 otherwise. On errors which are not + * critical for reading, such as missing header definitions, v->errcode is + * set to one of the BCF_ERR_* codes and must be checked before calling + * vcf_write(). + */ + int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v); + + /** + * bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field) + * + * Note that bcf_unpack() must be called even when reading VCF. It is safe + * to call the function repeatedly, it will not unpack the same field + * twice. + */ + #define BCF_UN_STR 1 // up to ALT inclusive + #define BCF_UN_FLT 2 // up to FILTER + #define BCF_UN_INFO 4 // up to INFO + #define BCF_UN_SHR (BCF_UN_STR|BCF_UN_FLT|BCF_UN_INFO) // all shared information + #define BCF_UN_FMT 8 // unpack format and each sample + #define BCF_UN_IND BCF_UN_FMT // a synonym of BCF_UN_FMT + #define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything + int bcf_unpack(bcf1_t *b, int which); + + /* + * bcf_dup() - create a copy of BCF record. 
+ * + * Note that bcf_unpack() must be called on the returned copy as if it was + * obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src) + * internally to reflect any changes made by bcf_update_* functions. + */ + bcf1_t *bcf_dup(bcf1_t *src); + + /** + * bcf_write() - write one VCF or BCF record. The type is determined at the open() call. + */ + int bcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v); + + /** + * The following functions work only with VCFs and should rarely be called + * directly. Usually one wants to use their bcf_* alternatives, which work + * transparently with both VCFs and BCFs. + */ + bcf_hdr_t *vcf_hdr_read(htsFile *fp); + int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h); + int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v); + int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v); + + /** Helper function for the bcf_itr_next() macro; internal use, ignore it */ + int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int *beg, int *end); + + + + /************************************************************************** + * Header querying and manipulation routines + **************************************************************************/ + + /** Create a new header using the supplied template */ + bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr); + /** Copy header lines from src to dst if not already present in dst. See also bcf_translate(). */ + void bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src); + + /** + * bcf_hdr_add_sample() - add a new sample. + * @param sample: Sample name to be added. After all samples have been added, NULL + * must be passed to update internal header structures. + */ + int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample); + + /** Read VCF header from a file and update the header */ + int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname); + + /** Returns formatted header (newly allocated string) and its length, + * excluding the terminating \0. 
If is_bcf parameter is unset, IDX + * fields are discarded. + */ + char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len); + + /** Append new VCF header line, returns 0 on success */ + int bcf_hdr_append(bcf_hdr_t *h, const char *line); + int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...); + + const char *bcf_hdr_get_version(const bcf_hdr_t *hdr); + void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version); + + /** + * bcf_hdr_remove() - remove VCF header tag + * @param type: one of BCF_HL_* + * @param key: tag name + */ + void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key); + + /** + * bcf_hdr_subset() - creates a new copy of the header removing unwanted samples + * @param n: number of samples to keep + * @param samples: names of the samples to keep + * @param imap: mapping from index in @samples to the sample index in the original file + * + * Sample names not present in h0 are ignored. The number of unmatched samples can be checked + * by comparing n and bcf_hdr_nsamples(out_hdr). + * This function can be used to reorder samples. + * See also bcf_subset() which subsets individual records. + */ + bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap); + + /** Creates a list of sequence names. 
It is up to the caller to free the list (but not the sequence names) */ + const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs); + + /** Get number of samples */ + #define bcf_hdr_nsamples(hdr) (hdr)->n[BCF_DT_SAMPLE] + + + /** The following functions are for internal use and should rarely be called directly */ + bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len); + void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str); + int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec); + bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *id); // type is one of BCF_HL_FLT,..,BCF_HL_CTG + bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec); + void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len); + void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted); + int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key); + void bcf_hrec_destroy(bcf_hrec_t *hrec); + + + + /************************************************************************** + * Individual record querying and manipulation routines + **************************************************************************/ + + /** See the description of bcf_hdr_subset() */ + int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap); + + /** + * bcf_translate() - translate tags ids to be consistent with different header. This function + * is useful when lines from multiple VCF need to be combined. 
+ * @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine() + * @src_hdr: the source header, used in bcf_read() + * @src_line: line obtained by bcf_read() + */ + int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line); + + /** + * bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc + */ + int bcf_get_variant_types(bcf1_t *rec); + int bcf_get_variant_type(bcf1_t *rec, int ith_allele); + int bcf_is_snp(bcf1_t *v); + + /** + * bcf_update_filter() - sets the FILTER column + * @flt_ids: The filter IDs to set, numeric IDs returned by bcf_id2int(hdr, BCF_DT_ID, "PASS") + * @n: Number of filters. If n==0, all filters are removed + */ + int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n); + /** + * bcf_add_filter() - adds to the FILTER column + * @flt_id: filter ID to add, numeric ID returned by bcf_id2int(hdr, BCF_DT_ID, "PASS") + * + * If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed. + */ + int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id); + /** + * bcf_remove_filter() - removes from the FILTER column + * @flt_id: filter ID to remove, numeric ID returned by bcf_id2int(hdr, BCF_DT_ID, "PASS") + * @pass: when set to 1 and no filters are present, set to PASS + */ + int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass); + /** + * Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably. 
+ */ + int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter); + /** + * bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT columns + * @alleles: Array of alleles + * @nals: Number of alleles + * @alleles_string: Comma-separated alleles, starting with the REF allele + */ + int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals); + int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string); + int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id); + + /* + * bcf_update_info_*() - functions for updating INFO fields + * @hdr: the BCF header + * @line: VCF line to be edited + * @key: the INFO tag to be updated + * @values: pointer to the array of values. Pass NULL to remove the tag. + * @n: number of values in the array. When set to 0, the INFO tag is removed + * + * The @string in bcf_update_info_flag() is optional, @n indicates whether + * the flag is set or removed. + * + * Returns 0 on success or negative value on error. + */ + #define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT) + #define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL) + #define bcf_update_info_flag(hdr,line,key,string,n) bcf_update_info((hdr),(line),(key),(string),(n),BCF_HT_FLAG) + #define bcf_update_info_string(hdr,line,key,string) bcf_update_info((hdr),(line),(key),(string),1,BCF_HT_STR) + int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type); + + /* + * bcf_update_format_*() - functions for updating FORMAT fields + * @values: pointer to the array of values, the same number of elements + * is expected for each sample. Missing values must be padded + * with bcf_*_missing or bcf_*_vector_end values. + * @n: number of values in the array. If n==0, existing tag is removed. 
+ * + * The function bcf_update_format_string() is a higher-level (slower) variant of + * bcf_update_format_char(). The former accepts array of \0-terminated strings + * whereas the latter requires that the strings are collapsed into a single array + * of fixed-length strings. In case of strings with variable length, shorter strings + * can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char() + * are not \0-terminated. + * + * Returns 0 on success or negative value on error. + */ + #define bcf_update_format_int32(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_INT) + #define bcf_update_format_float(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_REAL) + #define bcf_update_format_char(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_STR) + #define bcf_update_genotypes(hdr,line,gts,n) bcf_update_format((hdr),(line),"GT",(gts),(n),BCF_HT_INT) // See bcf_gt_ macros below + int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n); + int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type); + + // Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds + // to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained + // from bcf_get_genotypes() below. 
+ #define bcf_gt_phased(idx) (((idx)+1)<<1|1) // idx is parenthesized so that expression arguments expand correctly + #define bcf_gt_unphased(idx) (((idx)+1)<<1) + #define bcf_gt_missing 0 + #define bcf_gt_is_phased(idx) ((idx)&1) + #define bcf_gt_allele(val) (((val)>>1)-1) + + /** Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based) */ + #define bcf_alleles2gt(a,b) ((a)>(b)?((a)*((a)+1)/2+(b)):((b)*((b)+1)/2+(a))) + static inline void bcf_gt2alleles(int igt, int *a, int *b) + { + int k = 0, dk = 1; + while ( k 0 ) + * for (i=0; iid[type][int_id].key) + + /** + * bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID + * bcf_hdr_id2name() - Translates numeric ID to sequence name + */ + static inline int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id) { return bcf_hdr_id2int(hdr, BCF_DT_CTG, id); } + static inline const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid) { return hdr->id[BCF_DT_CTG][rid].key; } + static inline const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec) { return hdr->id[BCF_DT_CTG][rec->rid].key; } + + /** + * bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t + * @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT + * @int_id: return value of bcf_id2int, must be >=0 + * + * The returned values are: + * bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_* + * bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields + * bcf_hdr_id2type .. the field type, one of BCF_HT_* + * bcf_hdr_id2coltype .. the column type, one of BCF_HL_* + * + * Notes: Prior to using the macros, the presence of the info should be + * tested with bcf_hdr_idinfo_exists(). 
+ */ + #define bcf_hdr_id2length(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>8 & 0xf) + #define bcf_hdr_id2number(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12) + #define bcf_hdr_id2type(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf) + #define bcf_hdr_id2coltype(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) + #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id<0 || bcf_hdr_id2coltype(hdr,type,int_id)==0xf) ? 0 : 1) + #define bcf_hdr_id2hrec(hdr,type,int_id) ((hdr)->id[(type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(type)==BCF_DT_CTG?0:type]) + + void bcf_fmt_array(kstring_t *s, int n, int type, void *data); + uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr); + + void bcf_enc_vchar(kstring_t *s, int l, const char *a); + void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize); + void bcf_enc_vfloat(kstring_t *s, int n, float *a); + + + /************************************************************************** + * BCF index + * + * Note that these functions work with BCFs only. See synced_bcf_reader.h + * which provides (amongst other things) an API to work transparently with + * both indexed BCFs and VCFs. 
+ **************************************************************************/ + + #define bcf_itr_destroy(iter) hts_itr_destroy(iter) + #define bcf_itr_queryi(idx, tid, beg, end) hts_itr_query((idx), (tid), (beg), (end), bcf_readrec) + #define bcf_itr_querys(idx, hdr, s) hts_itr_querys((idx), (s), (hts_name2id_f)(bcf_hdr_name2id), (hdr), hts_itr_query, bcf_readrec) + #define bcf_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0) + #define bcf_index_load(fn) hts_idx_load(fn, HTS_FMT_CSI) + #define bcf_index_seqnames(idx, hdr, nptr) hts_idx_seqnames((idx),(nptr),(hts_id2name_f)(bcf_hdr_id2name),(hdr)) + + int bcf_index_build(const char *fn, int min_shift); + +#ifdef __cplusplus +} +#endif + +/******************* + * Typed value I/O * + *******************/ + +/* + Note that in contrast with BCFv2.1 specification, HTSlib implementation + allows missing values in vectors. For integer types, the values 0x80, + 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001, + 0x80000001 as end-of-vector indicators. Similarly for floats, the value of + 0x7F800001 is interpreted as a missing value and 0x7F800002 as an + end-of-vector indicator. + Note that the end-of-vector byte is not part of the vector. + + This trial BCF version (v2.2) is compatible with the VCF specification and + enables to handle correctly vectors with different ploidy in presence of + missing values. 
+ */ +#define bcf_int8_vector_end (INT8_MIN+1) +#define bcf_int16_vector_end (INT16_MIN+1) +#define bcf_int32_vector_end (INT32_MIN+1) +#define bcf_str_vector_end 0 +#define bcf_int8_missing INT8_MIN +#define bcf_int16_missing INT16_MIN +#define bcf_int32_missing INT32_MIN +#define bcf_str_missing 0x07 +extern uint32_t bcf_float_vector_end; +extern uint32_t bcf_float_missing; +static inline void bcf_float_set(float *ptr, uint32_t value) +{ + union { uint32_t i; float f; } u; + u.i = value; + *ptr = u.f; +} +#define bcf_float_set_vector_end(x) bcf_float_set(&(x),bcf_float_vector_end) +#define bcf_float_set_missing(x) bcf_float_set(&(x),bcf_float_missing) +static inline int bcf_float_is_missing(float f) +{ + union { uint32_t i; float f; } u; + u.f = f; + return u.i==bcf_float_missing ? 1 : 0; +} +static inline int bcf_float_is_vector_end(float f) +{ + union { uint32_t i; float f; } u; + u.f = f; + return u.i==bcf_float_vector_end ? 1 : 0; +} + +static inline void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) /* appends the genotype of sample isample to str: alleles joined by '/' or '|' (low bit set), '.' for a missing allele or an empty vector */ +{ + #define BRANCH(type_t, missing, vector_end) { \ + type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \ + int i; \ + for (i=0; i<fmt->n && ptr[i]!=vector_end; i++) \ + { \ + if ( i ) kputc("/|"[ptr[i]&1], str); \ + if ( !(ptr[i]>>1) ) kputc('.', str); \ + else kputw((ptr[i]>>1) - 1, str); \ + } \ + if (i == 0) kputc('.', str); \ + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break; + } + #undef BRANCH +} + +static inline void bcf_enc_size(kstring_t *s, int size, int type) +{ + if (size >= 15) { + kputc(15<<4|type, s); + if (size >= 128) { + if (size >= 32768) { + int32_t x = size; + kputc(1<<4|BCF_BT_INT32, s); + kputsn((char*)&x, 4, 
s); + } else { + int16_t x = size; + kputc(1<<4|BCF_BT_INT16, s); + kputsn((char*)&x, 2, s); + } + } else { + kputc(1<<4|BCF_BT_INT8, s); + kputc(size, s); + } + } else kputc(size<<4|type, s); +} + +static inline int bcf_enc_inttype(long x) +{ + if (x <= INT8_MAX && x > bcf_int8_missing) return BCF_BT_INT8; + if (x <= INT16_MAX && x > bcf_int16_missing) return BCF_BT_INT16; + return BCF_BT_INT32; +} + +static inline void bcf_enc_int1(kstring_t *s, int32_t x) +{ + if (x == bcf_int32_vector_end) { + bcf_enc_size(s, 1, BCF_BT_INT8); + kputc(bcf_int8_vector_end, s); + } else if (x == bcf_int32_missing) { + bcf_enc_size(s, 1, BCF_BT_INT8); + kputc(bcf_int8_missing, s); + } else if (x <= INT8_MAX && x > bcf_int8_missing) { + bcf_enc_size(s, 1, BCF_BT_INT8); + kputc(x, s); + } else if (x <= INT16_MAX && x > bcf_int16_missing) { + int16_t z = x; + bcf_enc_size(s, 1, BCF_BT_INT16); + kputsn((char*)&z, 2, s); + } else { + int32_t z = x; + bcf_enc_size(s, 1, BCF_BT_INT32); + kputsn((char*)&z, 4, s); + } +} + +static inline int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) +{ + if (type == BCF_BT_INT8) { + *q = (uint8_t*)p + 1; + return *(int8_t*)p; + } else if (type == BCF_BT_INT16) { + *q = (uint8_t*)p + 2; + return *(int16_t*)p; + } else { + *q = (uint8_t*)p + 4; + return *(int32_t*)p; + } +} + +static inline int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) +{ + return bcf_dec_int1(p + 1, *p&0xf, q); +} + +static inline int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type) +{ + *type = *p & 0xf; + if (*p>>4 != 15) { + *q = (uint8_t*)p + 1; + return *p>>4; + } else return bcf_dec_typed_int1(p + 1, q); +} + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/vcf_sweep.h b/star-sys/STAR/source/htslib/htslib/vcf_sweep.h new file mode 100644 index 0000000..9d21d8a --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/vcf_sweep.h @@ -0,0 +1,15 @@ +#ifndef __VCF_SWEEP_H__ +#define __VCF_SWEEP_H__ + +#include "hts.h" +#include "vcf.h" + +typedef 
struct _bcf_sweep_t bcf_sweep_t; + +bcf_sweep_t *bcf_sweep_init(const char *fname); +void bcf_sweep_destroy(bcf_sweep_t *sw); +bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw); +bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw); +bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw); + +#endif diff --git a/star-sys/STAR/source/htslib/htslib/vcfutils.h b/star-sys/STAR/source/htslib/htslib/vcfutils.h new file mode 100644 index 0000000..18b9503 --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib/vcfutils.h @@ -0,0 +1,94 @@ +/* + Time will show if this module will be merged into others + or perhaps removed completely. +*/ +#ifndef VCF_UTILS_H +#define VCF_UTILS_H + +#include "vcf.h" + + +/** + * bcf_trim_alleles() - remove ALT alleles unused in genotype fields + * @header: for access to BCF_DT_ID dictionary + * @line: VCF line obtain from vcf_parse1 + * + * Returns the number of removed alleles. + * + * todo: BCF output + */ +int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line); + + +/** + * bcf_remove_alleles() - remove ALT alleles according to bitmask @mask + * @header: for access to BCF_DT_ID dictionary + * @line: VCF line obtained from vcf_parse1 + * @mask: alleles to remove + * + * todo: BCF output + */ +void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask); + + +/** + * bcf_calc_ac() - calculate the number of REF and ALT alleles + * @header: for access to BCF_DT_ID dictionary + * @line: VCF line obtained from vcf_parse1 + * @ac: array of length line->n_allele + * @which: determine if INFO/AN,AC and indv fields be used + * + * Returns 1 if the call succeeded, or 0 if the value could not + * be determined. + * + * The value of @which determines if existing INFO/AC,AN can be + * used (BCF_UN_INFO) and and if indv fields can be splitted + * (BCF_UN_FMT). 
+ */ +int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which); + + +/** + * bcf_gt_type() - determines type of the genotype + * @fmt_ptr: the GT format field as set for example by set_fmt_ptr + * @isample: sample index (starting from 0) + * @ial: index of the 1st non-reference allele (starting from 1) + * @jal: index of the 2nd non-reference allele (starting from 1) + * + * Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA, + * GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial + * is not NULL and the genotype has one or more non-reference + * alleles, $ial will be set. In case of GT_HET_AA, $ial is the + * position of the allele which appeared first in ALT. If $jal is + * not null and the genotype is GT_HET_AA, $jal will be set and is + * the position of the second allele in ALT. + */ +#define GT_HOM_RR 0 // note: the actual value of GT_* matters, used in dosage r2 calculation +#define GT_HOM_AA 1 +#define GT_HET_RA 2 +#define GT_HET_AA 3 +#define GT_HAPL_R 4 +#define GT_HAPL_A 5 +#define GT_UNKN 6 +int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal); + +static inline int bcf_acgt2int(char c) +{ + if ( (int)c>96 ) c -= 32; + if ( c=='A' ) return 0; + if ( c=='C' ) return 1; + if ( c=='G' ) return 2; + if ( c=='T' ) return 3; + return -1; +} +#define bcf_int2acgt(i) "ACGT"[i] + +/** + * bcf_ij2G() - common task: allele indexes to Number=G index (diploid) + * @i,j: allele indexes, 0-based, i<=j + * + * Returns index to the Number=G diploid array + */ +#define bcf_ij2G(i, j) ((j)*((j)+1)/2+(i)) + +#endif diff --git a/star-sys/STAR/source/htslib/htslib_vars.mk b/star-sys/STAR/source/htslib/htslib_vars.mk new file mode 100644 index 0000000..73ac5eb --- /dev/null +++ b/star-sys/STAR/source/htslib/htslib_vars.mk @@ -0,0 +1,20 @@ +# Makefile variables useful for third-party code using htslib's public API. +# +# Copyright (C) 2013-2014 Genome Research Ltd. 
+# +# Author: John Marshall + +# These variables can be used to express dependencies on htslib headers. +# See htslib.mk for details. + +htslib_bgzf_h = $(HTSPREFIX)htslib/bgzf.h +htslib_faidx_h = $(HTSPREFIX)htslib/faidx.h +htslib_hfile_h = $(HTSPREFIX)htslib/hfile.h $(htslib_hts_defs_h) +htslib_hts_h = $(HTSPREFIX)htslib/hts.h +htslib_hts_defs_h = $(HTSPREFIX)htslib/hts_defs.h +htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) +htslib_synced_bcf_reader_h = $(HTSPREFIX)htslib/synced_bcf_reader.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_tbx_h) +htslib_tbx_h = $(HTSPREFIX)htslib/tbx.h $(htslib_hts_h) +htslib_vcf_h = $(HTSPREFIX)htslib/vcf.h $(htslib_hts_h) $(HTSPREFIX)htslib/kstring.h +htslib_vcf_sweep_h = $(HTSPREFIX)htslib/vcf_sweep.h $(htslib_hts_h) $(htslib_vcf_h) +htslib_vcfutils_h = $(HTSPREFIX)htslib/vcfutils.h $(htslib_vcf_h) diff --git a/star-sys/STAR/source/htslib/kfunc.c b/star-sys/STAR/source/htslib/kfunc.c new file mode 100644 index 0000000..10c0973 --- /dev/null +++ b/star-sys/STAR/source/htslib/kfunc.c @@ -0,0 +1,254 @@ +#include +#include +#include "htslib/kfunc.h" + +/* Log gamma function + * \log{\Gamma(z)} + * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 + */ +double kf_lgamma(double z) +{ + double x = 0; + x += 0.1659470187408462e-06 / (z+7); + x += 0.9934937113930748e-05 / (z+6); + x -= 0.1385710331296526 / (z+5); + x += 12.50734324009056 / (z+4); + x -= 176.6150291498386 / (z+3); + x += 771.3234287757674 / (z+2); + x -= 1259.139216722289 / (z+1); + x += 676.5203681218835 / z; + x += 0.9999999999995183; + return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5); +} + +/* complementary error function + * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt + * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66 + */ +double kf_erfc(double x) +{ + const double p0 = 220.2068679123761; + const double p1 = 221.2135961699311; + const double p2 = 112.0792914978709; + const double p3 = 33.912866078383; + const double p4 = 
6.37396220353165; + const double p5 = .7003830644436881; + const double p6 = .03526249659989109; + const double q0 = 440.4137358247522; + const double q1 = 793.8265125199484; + const double q2 = 637.3336333788311; + const double q3 = 296.5642487796737; + const double q4 = 86.78073220294608; + const double q5 = 16.06417757920695; + const double q6 = 1.755667163182642; + const double q7 = .08838834764831844; + double expntl, z, p; + z = fabs(x) * M_SQRT2; + if (z > 37.) return x > 0.? 0. : 2.; + expntl = exp(z * z * - .5); + if (z < 10. / M_SQRT2) // for small z + p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0) + / (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0); + else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65))))); + return x > 0.? 2. * p : 2. * (1. - p); +} + +/* The following computes regularized incomplete gamma functions. + * Formulas are taken from Wiki, with additional input from Numerical + * Recipes in C (for modified Lentz's algorithm) and AS245 + * (http://lib.stat.cmu.edu/apstat/245). + * + * A good online calculator is available at: + * + * http://www.danielsoper.com/statcalc/calc23.aspx + * + * It calculates upper incomplete gamma function, which equals + * kf_gammaq(s,z)*tgamma(s). + */ + +#define KF_GAMMA_EPS 1e-14 +#define KF_TINY 1e-290 + +// regularized lower incomplete gamma function, by series expansion +static double _kf_gammap(double s, double z) +{ + double sum, x; + int k; + for (k = 1, sum = x = 1.; k < 100; ++k) { + sum += (x *= z / (s + k)); + if (x / sum < KF_GAMMA_EPS) break; + } + return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum)); +} +// regularized upper incomplete gamma function, by continued fraction +static double _kf_gammaq(double s, double z) +{ + int j; + double C, D, f; + f = 1. 
+ z - s; C = f; D = 0.; + // Modified Lentz's algorithm for computing continued fraction + // See Numerical Recipes in C, 2nd edition, section 5.2 + for (j = 1; j < 100; ++j) { + double a = j * (s - j), b = (j<<1) + 1 + z - s, d; + D = b + a * D; + if (D < KF_TINY) D = KF_TINY; + C = b + a / C; + if (C < KF_TINY) C = KF_TINY; + D = 1. / D; + d = C * D; + f *= d; + if (fabs(d - 1.) < KF_GAMMA_EPS) break; + } + return exp(s * log(z) - z - kf_lgamma(s) - log(f)); +} + +double kf_gammap(double s, double z) +{ + return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z); +} + +double kf_gammaq(double s, double z) +{ + return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z); +} + +/* Regularized incomplete beta function. The method is taken from + * Numerical Recipe in C, 2nd edition, section 6.4. The following web + * page calculates the incomplete beta function, which equals + * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b): + * + * http://www.danielsoper.com/statcalc/calc36.aspx + */ +static double kf_betai_aux(double a, double b, double x) +{ + double C, D, f; + int j; + if (x == 0.) return 0.; + if (x == 1.) return 1.; + f = 1.; C = f; D = 0.; + // Modified Lentz's algorithm for computing continued fraction + for (j = 1; j < 200; ++j) { + double aa, d; + int m = j>>1; + aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1)) + : m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m)); + D = 1. + aa * D; + if (D < KF_TINY) D = KF_TINY; + C = 1. + aa / C; + if (C < KF_TINY) C = KF_TINY; + D = 1. / D; + d = C * D; + f *= d; + if (fabs(d - 1.) < KF_GAMMA_EPS) break; + } + return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f; +} +double kf_betai(double a, double b, double x) +{ + return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. 
- x); +} + +#ifdef KF_MAIN +#include +int main(int argc, char *argv[]) +{ + double x = 5.5, y = 3; + double a, b; + printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x)); + printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y)); + a = 2; b = 2; x = 0.5; + printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b))); + return 0; +} +#endif + + +// log\binom{n}{k} +static double lbinom(int n, int k) +{ + if (k == 0 || n == k) return 0; + return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1); +} + +// n11 n12 | n1_ +// n21 n22 | n2_ +//-----------+---- +// n_1 n_2 | n + +// hypergeometric distribution +static double hypergeo(int n11, int n1_, int n_1, int n) +{ + return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1)); +} + +typedef struct { + int n11, n1_, n_1, n; + double p; +} hgacc_t; + +// incremental version of hypergenometric distribution +static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux) +{ + if (n1_ || n_1 || n) { + aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n; + } else { // then only n11 changed; the rest fixed + if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) { + if (n11 == aux->n11 + 1) { // incremental + aux->p *= (double)(aux->n1_ - aux->n11) / n11 + * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1); + aux->n11 = n11; + return aux->p; + } + if (n11 == aux->n11 - 1) { // incremental + aux->p *= (double)aux->n11 / (aux->n1_ - n11) + * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11); + aux->n11 = n11; + return aux->p; + } + } + aux->n11 = n11; + } + aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n); + return aux->p; +} + +double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two) +{ + int i, j, max, min; + double p, q, left, right; + hgacc_t aux; + int n1_, n_1, n; + + n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 
and n + max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail + min = n1_ + n_1 - n; // not sure why n11-n22 is used instead of min(n_1,n1_) + if (min < 0) min = 0; // min n11, for left tail + *two = *_left = *_right = 1.; + if (min == max) return 1.; // no need to do test + q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table + // left tail + p = hypergeo_acc(min, 0, 0, 0, &aux); + for (left = 0., i = min + 1; p < 0.99999999 * q && i<=max; ++i) // loop until underflow + left += p, p = hypergeo_acc(i, 0, 0, 0, &aux); + --i; + if (p < 1.00000001 * q) left += p; + else --i; + // right tail + p = hypergeo_acc(max, 0, 0, 0, &aux); + for (right = 0., j = max - 1; p < 0.99999999 * q && j>=0; --j) // loop until underflow + right += p, p = hypergeo_acc(j, 0, 0, 0, &aux); + ++j; + if (p < 1.00000001 * q) right += p; + else ++j; + // two-tail + *two = left + right; + if (*two > 1.) *two = 1.; + // adjust left and right + if (abs(i - n11) < abs(j - n11)) right = 1. - left + q; + else left = 1.0 - right + q; + *_left = left; *_right = right; + return q; +} + + + diff --git a/star-sys/STAR/source/htslib/knetfile.c b/star-sys/STAR/source/htslib/knetfile.c new file mode 100644 index 0000000..64b3fa6 --- /dev/null +++ b/star-sys/STAR/source/htslib/knetfile.c @@ -0,0 +1,622 @@ +/* The MIT License + + Copyright (c) 2008 by Genome Research Ltd (GRL). + 2010 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Probably I will not do socket programming in the next few years and + therefore I decide to heavily annotate this file, for Linux and + Windows as well. -ac */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef _WIN32 +#include +#include +#include +#endif + +#include "htslib/knetfile.h" + +/* In winsock.h, the type of a socket is SOCKET, which is: "typedef + * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed + * integer -1. In knetfile.c, I use "int" for socket type + * throughout. This should be improved to avoid confusion. + * + * In Linux/Mac, recv() and read() do almost the same thing. You can see + * in the header file that netread() is simply an alias of read(). In + * Windows, however, they are different and using recv() is mandatory. + */ + +/* This function tests if the file handler is ready for reading (or + * writing if is_read==0). 
*/ +static int socket_wait(int fd, int is_read) +{ + fd_set fds, *fdr = 0, *fdw = 0; + struct timeval tv; + int ret; + tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (is_read) fdr = &fds; + else fdw = &fds; + ret = select(fd+1, fdr, fdw, 0, &tv); +#ifndef _WIN32 + if (ret == -1) perror("select"); +#else + if (ret == 0) + fprintf(stderr, "select time-out\n"); + else if (ret == SOCKET_ERROR) + fprintf(stderr, "select: %d\n", WSAGetLastError()); +#endif + return ret; +} + +#ifndef _WIN32 +/* This function does not work with Windows due to the lack of + * getaddrinfo() in winsock. It is addapted from an example in "Beej's + * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ +static int socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) + + int on = 1, fd; + struct linger lng = { 0, 0 }; + struct addrinfo hints, *res = 0; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + /* In Unix/Mac, getaddrinfo() is the most convenient way to get + * server information. */ + if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); + if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); + /* The following two setsockopt() are used by ftplib + * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they + * necessary. 
*/ + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); + freeaddrinfo(res); + return fd; +} +#else +/* MinGW's printf has problem with "%lld" */ +char *int64tostr(char *buf, int64_t x) +{ + int cnt; + int i = 0; + do { + buf[i++] = '0' + x % 10; + x /= 10; + } while (x); + buf[i] = 0; + for (cnt = i, i = 0; i < cnt/2; ++i) { + int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; + } + return buf; +} + +int64_t strtoint64(const char *buf) +{ + int64_t x; + for (x = 0; *buf != '\0'; ++buf) + x = x * 10 + ((int64_t) *buf - 48); + return x; +} +/* In windows, the first thing is to establish the TCP connection. */ +int knet_win32_init() +{ + WSADATA wsaData; + return WSAStartup(MAKEWORD(2, 2), &wsaData); +} +void knet_win32_destroy() +{ + WSACleanup(); +} +/* A slightly modfied version of the following function also works on + * Mac (and presummably Linux). However, this function is not stable on + * my Mac. It sometimes works fine but sometimes does not. Therefore for + * non-Windows OS, I do not use this one. 
*/ +static SOCKET socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) \ + do { \ + fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \ + return -1; \ + } while (0) + + int on = 1; + SOCKET fd; + struct linger lng = { 0, 0 }; + struct sockaddr_in server; + struct hostent *hp = 0; + // open socket + if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + // get host info + if (isalpha(host[0])) hp = gethostbyname(host); + else { + struct in_addr addr; + addr.s_addr = inet_addr(host); + hp = gethostbyaddr((char*)&addr, 4, AF_INET); + } + if (hp == 0) __err_connect("gethost"); + // connect + server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); + server.sin_family= AF_INET; + server.sin_port = htons(atoi(port)); + if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); + // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) + return fd; +} +#endif + +static off_t my_netread(int fd, void *buf, off_t len) +{ + off_t rest = len, curr, l = 0; + /* recv() and read() may not read the required length of data with + * one call. They have to be called repeatedly. */ + while (rest) { + if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading + curr = netread(fd, (void*)((char*)buf + l), rest); + /* According to the glibc manual, section 13.2, a zero returned + * value indicates end-of-file (EOF), which should mean that + * read() will not return zero if EOF has not been met but data + * are not immediately available. 
*/ + if (curr == 0) break; + l += curr; rest -= curr; + } + return l; +} + +/************************* + * FTP specific routines * + *************************/ + +static int kftp_get_response(knetFile *ftp) +{ +#ifndef _WIN32 + unsigned char c; +#else + char c; +#endif + int n = 0; + char *p; + if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; + while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + //fputc(c, stderr); + if (n >= ftp->max_response) { + ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; + ftp->response = (char*)realloc(ftp->response, ftp->max_response); + } + ftp->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) + && ftp->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + ftp->response[n-2] = 0; + return strtol(ftp->response, &p, 0); +} + +static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) +{ + if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + int len = strlen(cmd); + if ( netwrite(ftp->ctrl_fd, cmd, len) != len ) return -1; + return is_get? 
kftp_get_response(ftp) : 0; +} + +static int kftp_pasv_prep(knetFile *ftp) +{ + char *p; + int v[6]; + kftp_send_cmd(ftp, "PASV\r\n", 1); + for (p = ftp->response; *p && *p != '('; ++p); + if (*p != '(') return -1; + ++p; + sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); + memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); + ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; + return 0; +} + + +static int kftp_pasv_connect(knetFile *ftp) +{ + char host[80], port[10]; + if (ftp->pasv_port == 0) { + fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); + return -1; + } + sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); + sprintf(port, "%d", ftp->pasv_port); + ftp->fd = socket_connect(host, port); + if (ftp->fd == -1) return -1; + return 0; +} + +int kftp_connect(knetFile *ftp) +{ + ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); + if (ftp->ctrl_fd == -1) return -1; + kftp_get_response(ftp); + kftp_send_cmd(ftp, "USER anonymous\r\n", 1); + kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); + kftp_send_cmd(ftp, "TYPE I\r\n", 1); + return 0; +} + +int kftp_reconnect(knetFile *ftp) +{ + if (ftp->ctrl_fd != -1) { + netclose(ftp->ctrl_fd); + ftp->ctrl_fd = -1; + } + netclose(ftp->fd); + ftp->fd = -1; + return kftp_connect(ftp); +} + +// initialize ->type, ->host, ->retr and ->size +knetFile *kftp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p; + int l; + if (strstr(fn, "ftp://") != fn) return 0; + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return 0; + l = p - fn - 6; + fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_FTP; + fp->fd = -1; + /* the Linux/Mac version of socket_connect() also recognizes a port + * like "ftp", but the Windows version does not. 
*/ + fp->port = strdup("21"); + fp->host = (char*)calloc(l + 1, 1); + if (strchr(mode, 'c')) fp->no_reconnect = 1; + strncpy(fp->host, fn + 6, l); + fp->retr = (char*)calloc(strlen(p) + 8, 1); + sprintf(fp->retr, "RETR %s\r\n", p); + fp->size_cmd = (char*)calloc(strlen(p) + 8, 1); + sprintf(fp->size_cmd, "SIZE %s\r\n", p); + fp->seek_offset = 0; + return fp; +} +// place ->fd at offset off +int kftp_connect_file(knetFile *fp) +{ + int ret; + long long file_size; + if (fp->fd != -1) { + netclose(fp->fd); + if (fp->no_reconnect) kftp_get_response(fp); + } + kftp_pasv_prep(fp); + kftp_send_cmd(fp, fp->size_cmd, 1); +#ifndef _WIN32 + if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) + { + fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); + return -1; + } +#else + const char *p = fp->response; + while (*p != ' ') ++p; + while (*p < '0' || *p > '9') ++p; + file_size = strtoint64(p); +#endif + fp->file_size = file_size; + if (fp->offset>=0) { + char tmp[32]; +#ifndef _WIN32 + sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); +#else + strcpy(tmp, "REST "); + int64tostr(tmp + 5, fp->offset); + strcat(tmp, "\r\n"); +#endif + kftp_send_cmd(fp, tmp, 1); + } + kftp_send_cmd(fp, fp->retr, 0); + kftp_pasv_connect(fp); + ret = kftp_get_response(fp); + if (ret != 150) { + fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + fp->is_ready = 1; + return 0; +} + + +/************************** + * HTTP specific routines * + **************************/ + +knetFile *khttp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p, *proxy, *q; + int l; + if (strstr(fn, "http://") != fn) return 0; + // set ->http_host + for (p = (char*)fn + 7; *p && *p != '/'; ++p); + l = p - fn - 7; + fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->http_host = (char*)calloc(l + 1, 1); + strncpy(fp->http_host, fn + 7, l); + fp->http_host[l] = 0; + for (q = fp->http_host; *q && *q != ':'; ++q); + if (*q == ':') 
*q++ = 0; + // get http_proxy + proxy = getenv("http_proxy"); + // set ->host, ->port and ->path + if (proxy == 0) { + fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. + fp->port = strdup(*q? q : "80"); + fp->path = strdup(*p? p : "/"); + } else { + fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); + for (q = fp->host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + fp->port = strdup(*q? q : "80"); + fp->path = strdup(fn); + } + fp->type = KNF_TYPE_HTTP; + fp->ctrl_fd = fp->fd = -1; + fp->seek_offset = 0; + return fp; +} + +int khttp_connect_file(knetFile *fp) +{ + int ret, l = 0; + char *buf, *p; + if (fp->fd != -1) netclose(fp->fd); + fp->fd = socket_connect(fp->host, fp->port); + buf = (char*)calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. + l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); + l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); + l += sprintf(buf + l, "\r\n"); + if ( netwrite(fp->fd, buf, l) != l ) return -1; + l = 0; + while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency + if (buf[l] == '\n' && l >= 3) + if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; + ++l; + } + buf[l] = 0; + if (l < 14) { // prematured header + netclose(fp->fd); + fp->fd = -1; + return -1; + } + ret = strtol(buf + 8, &p, 0); // HTTP return code + if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file + off_t rest = fp->offset; + while (rest) { + off_t l = rest < 0x10000? 
rest : 0x10000; + rest -= my_netread(fp->fd, buf, l); + } + } else if (ret != 206 && ret != 200) { + free(buf); + fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + free(buf); + fp->is_ready = 1; + return 0; +} + +/******************** + * Generic routines * + ********************/ + +knetFile *knet_open(const char *fn, const char *mode) +{ + knetFile *fp = 0; + if (mode[0] != 'r') { + fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); + return 0; + } + if (strstr(fn, "ftp://") == fn) { + fp = kftp_parse_url(fn, mode); + if (fp == 0) return 0; + if (kftp_connect(fp) == -1) { + knet_close(fp); + return 0; + } + kftp_connect_file(fp); + } else if (strstr(fn, "http://") == fn) { + fp = khttp_parse_url(fn, mode); + if (fp == 0) return 0; + khttp_connect_file(fp); + } else { // local file +#ifdef _WIN32 + /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may + * be undefined on some systems, although it is defined on my + * Mac and the Linux I have tested on. 
*/ + int fd = open(fn, O_RDONLY | O_BINARY); +#else + int fd = open(fn, O_RDONLY); +#endif + if (fd == -1) { + perror("open"); + return 0; + } + fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + fp->ctrl_fd = -1; + } + if (fp && fp->fd == -1) { + knet_close(fp); + return 0; + } + return fp; +} + +knetFile *knet_dopen(int fd, const char *mode) +{ + knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + return fp; +} + +ssize_t knet_read(knetFile *fp, void *buf, size_t len) +{ + off_t l = 0; + if (fp->fd == -1) return 0; + if (fp->type == KNF_TYPE_FTP) { + if (fp->is_ready == 0) { + if (!fp->no_reconnect) kftp_reconnect(fp); + kftp_connect_file(fp); + } + } else if (fp->type == KNF_TYPE_HTTP) { + if (fp->is_ready == 0) + khttp_connect_file(fp); + } + if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX + size_t rest = len; + ssize_t curr; + while (rest) { + do { + curr = read(fp->fd, (void*)((char*)buf + l), rest); + } while (curr < 0 && EINTR == errno); + if (curr < 0) return -1; + if (curr == 0) break; + l += curr; rest -= curr; + } + } else l = my_netread(fp->fd, buf, len); + fp->offset += l; + return l; +} + +off_t knet_seek(knetFile *fp, off_t off, int whence) +{ + if (whence == SEEK_SET && off == fp->offset) return 0; + if (fp->type == KNF_TYPE_LOCAL) { + /* Be aware that lseek() returns the offset after seeking, while fseek() returns zero on success. 
*/ + off_t offset = lseek(fp->fd, off, whence); + if (offset == -1) return -1; + fp->offset = offset; + return fp->offset; + } else if (fp->type == KNF_TYPE_FTP) { + if (whence == SEEK_CUR) fp->offset += off; + else if (whence == SEEK_SET) fp->offset = off; + else if (whence == SEEK_END) fp->offset = fp->file_size + off; + else return -1; + fp->is_ready = 0; + return fp->offset; + } else if (fp->type == KNF_TYPE_HTTP) { + if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? + fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n"); + errno = ESPIPE; + return -1; + } + if (whence == SEEK_CUR) fp->offset += off; + else if (whence == SEEK_SET) fp->offset = off; + else return -1; + fp->is_ready = 0; + return fp->offset; + } + errno = EINVAL; + fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); + return -1; +} + +int knet_close(knetFile *fp) +{ + if (fp == 0) return 0; + if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific + if (fp->fd != -1) { + /* On Linux/Mac, netclose() is an alias of close(), but on + * Windows, it is an alias of closesocket(). 
*/ + if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); + else netclose(fp->fd); + } + free(fp->host); free(fp->port); + free(fp->response); free(fp->retr); // FTP specific + free(fp->path); free(fp->http_host); // HTTP specific + free(fp); + return 0; +} + +#ifdef KNETFILE_MAIN +int main(void) +{ + char *buf; + knetFile *fp; + int type = 4, l; +#ifdef _WIN32 + knet_win32_init(); +#endif + buf = calloc(0x100000, 1); + if (type == 0) { + fp = knet_open("knetfile.c", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 1) { // NCBI FTP, large file + fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); + knet_seek(fp, 2500000000ll, SEEK_SET); + l = knet_read(fp, buf, 255); + } else if (type == 2) { + fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 3) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 4) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); + knet_read(fp, buf, 10000); + knet_seek(fp, 20000, SEEK_SET); + knet_seek(fp, 10000, SEEK_SET); + l = knet_read(fp, buf+10000, 10000000) + 10000; + } + if (type != 4 && type != 1) { + knet_read(fp, buf, 255); + buf[255] = 0; + printf("%s\n", buf); + } else write(fileno(stdout), buf, l); + knet_close(fp); + free(buf); + return 0; +} +#endif diff --git a/star-sys/STAR/source/htslib/kstring.c b/star-sys/STAR/source/htslib/kstring.c new file mode 100644 index 0000000..b4202f5 --- /dev/null +++ b/star-sys/STAR/source/htslib/kstring.c @@ -0,0 +1,229 @@ +#include +#include +#include +#include +#include +#include "htslib/kstring.h" + +int kvsprintf(kstring_t *s, const char *fmt, va_list ap) +{ + va_list args; + int l; + va_copy(args, ap); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'. 
+ va_end(args); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_copy(args, ap); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); + va_end(args); + } + s->l += l; + return l; +} + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = kvsprintf(s, fmt, ap); + va_end(ap); + return l; +} + +char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) +{ + const char *p, *start; + if (sep) { // set up the table + if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished + aux->finished = 0; + if (sep[1]) { + aux->sep = -1; + aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; + for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); + } else aux->sep = sep[0]; + } + if (aux->finished) return 0; + else if (str) aux->p = str - 1, aux->finished = 0; + if (aux->sep < 0) { + for (p = start = aux->p + 1; *p; ++p) + if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; + } else { + for (p = start = aux->p + 1; *p; ++p) + if (*p == aux->sep) break; + } + aux->p = p; // end of token + if (*p == 0) aux->finished = 1; // no more tokens + return (char*)start; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + int *tmp; \ + max = max? 
max<<1 : 2; \ + if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) { \ + offsets = tmp; \ + } else { \ + free(offsets); \ + *_offsets = NULL; \ + return 0; \ + } \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +/********************** + * Boyer-Moore search * + **********************/ + +typedef unsigned char ubyte_t; + +// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html +static int *ksBM_prep(const ubyte_t *pat, int m) +{ + int i, *suff, *prep, *bmGs, *bmBc; + prep = (int*)calloc(m + 256, sizeof(int)); + bmGs = prep; bmBc = prep + m; + { // preBmBc() + for (i = 0; i < 256; ++i) bmBc[i] = m; + for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; + } + suff = (int*)calloc(m, sizeof(int)); + { // suffixes() + int f = 0, g; + suff[m - 1] = m; + g = m - 1; + for (i = m - 2; i >= 0; --i) { + if (i > g && suff[i + m - 1 - f] < i - g) + suff[i] = suff[i + m - 1 - f]; + else { + if (i < g) g = i; + f = i; + while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; + suff[i] = f - g; + } + } + } + { // preBmGs() + int j = 0; + for (i = 0; i < m; ++i) bmGs[i] = m; + for (i = m - 1; i >= 0; --i) + if (suff[i] == i + 1) + for (; j < m - 1 - i; ++j) + if (bmGs[j] == m) + bmGs[j] = m - 1 - i; + for (i = 0; i <= m - 2; ++i) + bmGs[m - 1 - suff[i]] = m - 1 - i; + } + free(suff); + return prep; +} + +void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep) +{ + int i, j, *prep = 0, 
*bmGs, *bmBc; + const ubyte_t *str, *pat; + str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat; + prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep; + if (_prep && *_prep == 0) *_prep = prep; + bmGs = prep; bmBc = prep + m; + j = 0; + while (j <= n - m) { + for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); + if (i >= 0) { + int max = bmBc[str[i+j]] - m + 1 + i; + if (max < bmGs[i]) max = bmGs[i]; + j += max; + } else return (void*)(str + j); + } + if (_prep == 0) free(prep); + return 0; +} + +char *kstrstr(const char *str, const char *pat, int **_prep) +{ + return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep); +} + +char *kstrnstr(const char *str, const char *pat, int n, int **_prep) +{ + return (char*)kmemmem(str, n, pat, strlen(pat), _prep); +} + +/*********************** + * The main() function * + ***********************/ + +#ifdef KSTRING_MAIN +#include +int main() +{ + kstring_t *s; + int *fields, n, i; + ks_tokaux_t aux; + char *p; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + // test kstrtok() + s->l = 0; + for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) { + kputsn(p, aux.p - p, s); + kputc('\n', s); + } + printf("%s", s->s); + // free + free(s->s); free(s); free(fields); + + { + static char *str = "abcdefgcdgcagtcakcdcd"; + static char *pat = "cd"; + char *ret, *s = str; + int *prep = 0; + while ((ret = kstrstr(s, pat, &prep)) != 0) { + printf("match: %s\n", ret); + s = ret + prep[0]; + } + free(prep); + } + return 0; +} +#endif diff --git a/star-sys/STAR/source/htslib/sam.5 b/star-sys/STAR/source/htslib/sam.5 new file mode 100644 index 0000000..4c86aa8 --- /dev/null +++ b/star-sys/STAR/source/htslib/sam.5 @@ -0,0 +1,45 @@ +'\" t +.TH sam 5 "August 2013" "htslib" "Bioinformatics 
formats" +.SH NAME +sam \- Sequence Alignment/Map file format +.SH DESCRIPTION +Sequence Alignment/Map (SAM) format is TAB-delimited. Apart from the header lines, which are started +with the `@' symbol, each alignment line consists of: +.TS +nlbl. +1 QNAME Query template/pair NAME +2 FLAG bitwise FLAG +3 RNAME Reference sequence NAME +4 POS 1-based leftmost POSition/coordinate of clipped sequence +5 MAPQ MAPping Quality (Phred-scaled) +6 CIGAR extended CIGAR string +7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME) +8 MPOS 1-based Mate POSition +9 TLEN inferred Template LENgth (insert size) +10 SEQ query SEQuence on the same strand as the reference +11 QUAL query QUALity (ASCII-33 gives the Phred base quality) +12+ OPT variable OPTional fields in the format TAG:VTYPE:VALUE +.TE +.PP +Each bit in the FLAG field is defined as: +.TS +lcbl. +0x0001 p the read is paired in sequencing +0x0002 P the read is mapped in a proper pair +0x0004 u the query sequence itself is unmapped +0x0008 U the mate is unmapped +0x0010 r strand of the query (1 for reverse) +0x0020 R strand of the mate +0x0040 1 the read is the first read in a pair +0x0080 2 the read is the second read in a pair +0x0100 s the alignment is not primary +0x0200 f the read fails platform/vendor quality checks +0x0400 d the read is either a PCR or an optical duplicate +0x0800 S the alignment is supplementary +.TE +.P +where the second column gives the string representation of the FLAG field. 
+.SH SEE ALSO +.TP +https://github.com/samtools/hts-specs +The full SAM/BAM file format specification diff --git a/star-sys/STAR/source/htslib/sam.c b/star-sys/STAR/source/htslib/sam.c new file mode 100644 index 0000000..9c9112d --- /dev/null +++ b/star-sys/STAR/source/htslib/sam.c @@ -0,0 +1,1797 @@ +#include +#include +#include +#include +#include +#include +#include "htslib/sam.h" +#include "htslib/bgzf.h" +#include "cram/cram.h" +#include "htslib/hfile.h" + +#include "htslib/khash.h" +KHASH_DECLARE(s2i, kh_cstr_t, int64_t) + +typedef khash_t(s2i) sdict_t; + +/********************** + *** BAM header I/O *** + **********************/ + +bam_hdr_t *bam_hdr_init() +{ + return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t)); +} + +void bam_hdr_destroy(bam_hdr_t *h) +{ + int32_t i; + if (h == NULL) return; + if (h->target_name) { + for (i = 0; i < h->n_targets; ++i) + free(h->target_name[i]); + free(h->target_name); + free(h->target_len); + } + free(h->text); free(h->cigar_tab); + if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict); + free(h); +} + +bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0) +{ + if (h0 == NULL) return NULL; + bam_hdr_t *h; + if ((h = bam_hdr_init()) == NULL) return NULL; + // copy the simple data + h->n_targets = h0->n_targets; + h->ignore_sam_err = h0->ignore_sam_err; + h->l_text = h0->l_text; + // Then the pointery stuff + h->cigar_tab = NULL; + h->sdict = NULL; + h->text = (char*)calloc(h->l_text + 1, 1); + memcpy(h->text, h0->text, h->l_text); + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); + int i; + for (i = 0; i < h->n_targets; ++i) { + h->target_len[i] = h0->target_len[i]; + h->target_name[i] = strdup(h0->target_name[i]); + } + return h; +} + + +static bam_hdr_t *hdr_from_dict(sdict_t *d) +{ + bam_hdr_t *h; + khint_t k; + h = bam_hdr_init(); + h->sdict = d; + h->n_targets = kh_size(d); + h->target_len = (uint32_t*)malloc(4 * h->n_targets); + h->target_name = 
(char**)malloc(sizeof(char*) * h->n_targets); + for (k = kh_begin(d); k != kh_end(d); ++k) { + if (!kh_exist(d, k)) continue; + h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k); + h->target_len[kh_val(d, k)>>32] = kh_val(d, k)<<32>>32; + kh_val(d, k) >>= 32; + } + return h; +} + +bam_hdr_t *bam_hdr_read(BGZF *fp) +{ + bam_hdr_t *h; + char buf[4]; + int magic_len, has_EOF; + int32_t i = 1, name_len; + // check EOF + has_EOF = bgzf_check_EOF(fp); + if (has_EOF < 0) { + perror("[W::sam_hdr_read] bgzf_check_EOF"); + } else if (has_EOF == 0 && hts_verbose >= 2) + fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__); + // read "BAM1" + magic_len = bgzf_read(fp, buf, 4); + if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) { + if (hts_verbose >= 1) fprintf(stderr, "[E::%s] invalid BAM binary header\n", __func__); + return 0; + } + h = bam_hdr_init(); + // read plain text and the number of reference sequences + bgzf_read(fp, &h->l_text, 4); + if (fp->is_be) ed_swap_4p(&h->l_text); + h->text = (char*)malloc(h->l_text + 1); + h->text[h->l_text] = 0; // make sure it is NULL terminated + bgzf_read(fp, h->text, h->l_text); + bgzf_read(fp, &h->n_targets, 4); + if (fp->is_be) ed_swap_4p(&h->n_targets); + // read reference sequence names and lengths + h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + for (i = 0; i != h->n_targets; ++i) { + bgzf_read(fp, &name_len, 4); + if (fp->is_be) ed_swap_4p(&name_len); + h->target_name[i] = (char*)calloc(name_len, 1); + bgzf_read(fp, h->target_name[i], name_len); + bgzf_read(fp, &h->target_len[i], 4); + if (fp->is_be) ed_swap_4p(&h->target_len[i]); + } + return h; +} + +int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) +{ + char buf[4]; + int32_t i, name_len, x; + // write "BAM1" + strncpy(buf, "BAM\1", 4); + bgzf_write(fp, buf, 4); + // write plain text and the number of reference sequences + if (fp->is_be) { + x = 
ed_swap_4(h->l_text); + bgzf_write(fp, &x, 4); + if (h->l_text) bgzf_write(fp, h->text, h->l_text); + x = ed_swap_4(h->n_targets); + bgzf_write(fp, &x, 4); + } else { + bgzf_write(fp, &h->l_text, 4); + if (h->l_text) bgzf_write(fp, h->text, h->l_text); + bgzf_write(fp, &h->n_targets, 4); + } + // write sequence names and lengths + for (i = 0; i != h->n_targets; ++i) { + char *p = h->target_name[i]; + name_len = strlen(p) + 1; + if (fp->is_be) { + x = ed_swap_4(name_len); + bgzf_write(fp, &x, 4); + } else bgzf_write(fp, &name_len, 4); + bgzf_write(fp, p, name_len); + if (fp->is_be) { + x = ed_swap_4(h->target_len[i]); + bgzf_write(fp, &x, 4); + } else bgzf_write(fp, &h->target_len[i], 4); + } + bgzf_flush(fp); + return 0; +} + +int bam_name2id(bam_hdr_t *h, const char *ref) +{ + sdict_t *d = (sdict_t*)h->sdict; + khint_t k; + if (h->sdict == 0) { + int i, absent; + d = kh_init(s2i); + for (i = 0; i < h->n_targets; ++i) { + k = kh_put(s2i, d, h->target_name[i], &absent); + kh_val(d, k) = i; + } + h->sdict = d; + } + k = kh_get(s2i, d, ref); + return k == kh_end(d)? 
-1 : kh_val(d, k); +} + +/************************* + *** BAM alignment I/O *** + *************************/ + +bam1_t *bam_init1() +{ + return (bam1_t*)calloc(1, sizeof(bam1_t)); +} + +void bam_destroy1(bam1_t *b) +{ + if (b == 0) return; + free(b->data); free(b); +} + +bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) +{ + uint8_t *data = bdst->data; + int m_data = bdst->m_data; // backup data and m_data + if (m_data < bsrc->l_data) { // double the capacity + m_data = bsrc->l_data; kroundup32(m_data); + data = (uint8_t*)realloc(data, m_data); + } + memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data + *bdst = *bsrc; // copy the rest + // restore the backup + bdst->m_data = m_data; + bdst->data = data; + return bdst; +} + +bam1_t *bam_dup1(const bam1_t *bsrc) +{ + if (bsrc == NULL) return NULL; + bam1_t *bdst = bam_init1(); + if (bdst == NULL) return NULL; + return bam_copy1(bdst, bsrc); +} + +int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) +{ + int k, l; + for (k = l = 0; k < n_cigar; ++k) + if (bam_cigar_type(bam_cigar_op(cigar[k]))&1) + l += bam_cigar_oplen(cigar[k]); + return l; +} + +int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) +{ + int k, l; + for (k = l = 0; k < n_cigar; ++k) + if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) + l += bam_cigar_oplen(cigar[k]); + return l; +} + +int32_t bam_endpos(const bam1_t *b) +{ + if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0) + return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); + else + return b->core.pos + 1; +} + +static inline int aux_type2size(uint8_t type) +{ + switch (type) { + case 'A': case 'c': case 'C': + return 1; + case 's': case 'S': + return 2; + case 'i': case 'I': case 'f': + return 4; + case 'd': + return 8; + case 'Z': case 'H': case 'B': + return type; + default: + return 0; + } +} + +static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host) +{ + uint8_t *s; + uint32_t *cigar = (uint32_t*)(data + c->l_qname); + uint32_t 
i, n; + s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; + for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]); + while (s < data + l_data) { + int size; + s += 2; // skip key + size = aux_type2size(*s); ++s; // skip type + switch (size) { + case 1: ++s; break; + case 2: ed_swap_2p(s); s += 2; break; + case 4: ed_swap_4p(s); s += 4; break; + case 8: ed_swap_8p(s); s += 8; break; + case 'Z': + case 'H': + while (*s) ++s; + ++s; + break; + case 'B': + size = aux_type2size(*s); ++s; + if (is_host) memcpy(&n, s, 4), ed_swap_4p(s); + else ed_swap_4p(s), memcpy(&n, s, 4); + s += 4; + switch (size) { + case 1: s += n; break; + case 2: for (i = 0; i < n; ++i, s += 2) ed_swap_2p(s); break; + case 4: for (i = 0; i < n; ++i, s += 4) ed_swap_4p(s); break; + case 8: for (i = 0; i < n; ++i, s += 8) ed_swap_8p(s); break; + } + break; + } + } +} + +int bam_read1(BGZF *fp, bam1_t *b) +{ + bam1_core_t *c = &b->core; + int32_t block_len, ret, i; + uint32_t x[8]; + if ((ret = bgzf_read(fp, &block_len, 4)) != 4) { + if (ret == 0) return -1; // normal end-of-file + else return -2; // truncated + } + if (bgzf_read(fp, x, 32) != 32) return -3; + if (fp->is_be) { + ed_swap_4p(&block_len); + for (i = 0; i < 8; ++i) ed_swap_4p(x + i); + } + c->tid = x[0]; c->pos = x[1]; + c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; + c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; + c->l_qseq = x[4]; + c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + b->l_data = block_len - 32; + if (b->l_data < 0 || c->l_qseq < 0) return -4; + if ((char *)bam_get_aux(b) - (char *)b->data > b->l_data) + return -4; + if (b->m_data < b->l_data) { + b->m_data = b->l_data; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + if (!b->data) + return -4; + } + if (bgzf_read(fp, b->data, b->l_data) != b->l_data) return -4; + //b->l_aux = b->l_data - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; + if (fp->is_be) swap_data(c, b->l_data, b->data, 0); + 
return 4 + block_len; +} + +int bam_write1(BGZF *fp, const bam1_t *b) +{ + const bam1_core_t *c = &b->core; + uint32_t x[8], block_len = b->l_data + 32, y; + int i, ok; + x[0] = c->tid; + x[1] = c->pos; + x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname; + x[3] = (uint32_t)c->flag<<16 | c->n_cigar; + x[4] = c->l_qseq; + x[5] = c->mtid; + x[6] = c->mpos; + x[7] = c->isize; + ok = (bgzf_flush_try(fp, 4 + block_len) >= 0); + if (fp->is_be) { + for (i = 0; i < 8; ++i) ed_swap_4p(x + i); + y = block_len; + if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); + swap_data(c, b->l_data, b->data, 1); + } else { + if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0); + } + if (ok) ok = (bgzf_write(fp, x, 32) >= 0); + if (ok) ok = (bgzf_write(fp, b->data, b->l_data) >= 0); + if (fp->is_be) swap_data(c, b->l_data, b->data, 0); + return ok? 4 + block_len : -1; +} + +/******************** + *** BAM indexing *** + ********************/ + +static hts_idx_t *bam_index(BGZF *fp, int min_shift) +{ + int n_lvls, i, fmt; + bam1_t *b; + hts_idx_t *idx; + bam_hdr_t *h; + h = bam_hdr_read(fp); + if (min_shift > 0) { + int64_t max_len = 0, s; + for (i = 0; i < h->n_targets; ++i) + if (max_len < h->target_len[i]) max_len = h->target_len[i]; + max_len += 256; + for (n_lvls = 0, s = 1< s; ++n_lvls, s <<= 3); + fmt = HTS_FMT_CSI; + } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; + idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls); + bam_hdr_destroy(h); + b = bam_init1(); + while (bam_read1(fp, b) >= 0) { + int l, ret; + l = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); + if (l == 0) l = 1; // no zero-length records + ret = hts_idx_push(idx, b->core.tid, b->core.pos, b->core.pos + l, bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP)); + if (ret < 0) + { + // unsorted + bam_destroy1(b); + hts_idx_destroy(idx); + return NULL; + } + } + hts_idx_finish(idx, bgzf_tell(fp)); + bam_destroy1(b); + return idx; +} + +int bam_index_build(const char *fn, int min_shift) +{ + 
hts_idx_t *idx; + htsFile *fp; + int ret = 0; + + if ((fp = hts_open(fn, "r")) == 0) return -1; + if (fp->is_cram) { + ret = cram_index_build(fp->fp.cram, fn); + } else { + idx = bam_index(fp->fp.bgzf, min_shift); + if ( !idx ) + { + hts_close(fp); + return -1; + } + hts_idx_save(idx, fn, min_shift > 0 + ? HTS_FMT_CSI : HTS_FMT_BAI); + hts_idx_destroy(idx); + } + hts_close(fp); + + return ret; +} + +static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end) +{ + bam1_t *b = bv; + int ret; + if ((ret = bam_read1(fp, b)) >= 0) { + *tid = b->core.tid; *beg = b->core.pos; + *end = b->core.pos + (b->core.n_cigar? bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)) : 1); + } + return ret; +} + +// This is used only with read_rest=1 iterators, so need not set tid/beg/end. +static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) +{ + htsFile *fp = fpv; + bam1_t *b = bv; + return cram_get_bam_seq(fp->fp.cram, &b); +} + +// This is used only with read_rest=1 iterators, so need not set tid/beg/end. +static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end) +{ + htsFile *fp = fpv; + bam1_t *b = bv; + if (fp->is_bin) return bam_read1(bgzfp, b); + else if (fp->is_cram) return cram_get_bam_seq(fp->fp.cram, &b); + else { + // TODO Need headers available to implement this for SAM files + fprintf(stderr, "[sam_bam_cram_readrec] Not implemented for SAM files -- Exiting\n"); + abort(); + } +} + +// The CRAM implementation stores the loaded index within the cram_fd rather +// than separately as is done elsewhere in htslib. So if p is a pointer to +// an hts_idx_t with p->fmt == HTS_FMT_CRAI, then it actually points to an +// hts_cram_idx_t and should be cast accordingly. 
+typedef struct hts_cram_idx_t { + int fmt; + cram_fd *cram; +} hts_cram_idx_t; + +hts_idx_t *sam_index_load(samFile *fp, const char *fn) +{ + if (fp->is_bin) return bam_index_load(fn); + else if (fp->is_cram) { + if (cram_index_load(fp->fp.cram, fn) < 0) return NULL; + // Cons up a fake "index" just pointing at the associated cram_fd: + hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t)); + if (idx == NULL) return NULL; + idx->fmt = HTS_FMT_CRAI; + idx->cram = fp->fp.cram; + return (hts_idx_t *) idx; + } + else return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t +} + +static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) +{ + const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; + hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); + if (iter == NULL) return NULL; + + // Cons up a dummy iterator for which hts_itr_next() will simply invoke + // the readrec function: + iter->read_rest = 1; + iter->off = NULL; + iter->bins.a = NULL; + iter->readrec = readrec; + + if (tid >= 0) { + cram_range r = { tid, beg+1, end }; + if (cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r) != 0) { free(iter); return NULL; } + iter->curr_off = 0; + // The following fields are not required by hts_itr_next(), but are + // filled in in case user code wants to look at them. 
+ iter->tid = tid; + iter->beg = beg; + iter->end = end; + } + else switch (tid) { + case HTS_IDX_REST: + iter->curr_off = 0; + break; + case HTS_IDX_NONE: + iter->curr_off = 0; + iter->finished = 1; + break; + default: + fprintf(stderr, "[cram_itr_query] tid=%d not implemented for CRAM files -- Exiting\n", tid); + abort(); + break; + } + + return iter; +} + +hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) +{ + const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; + if (idx == NULL) + return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec); + else if (cidx->fmt == HTS_FMT_CRAI) + return cram_itr_query(idx, tid, beg, end, cram_readrec); + else + return hts_itr_query(idx, tid, beg, end, bam_readrec); +} + +static int cram_name2id(void *fdv, const char *ref) +{ + cram_fd *fd = (cram_fd *) fdv; + return sam_hdr_name2ref(fd->header, ref); +} + +hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) +{ + const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; + if (cidx->fmt == HTS_FMT_CRAI) + return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec); + else + return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec); +} + +/********************** + *** SAM header I/O *** + **********************/ + +#include "htslib/kseq.h" +#include "htslib/kstring.h" + +bam_hdr_t *sam_hdr_parse(int l_text, const char *text) +{ + const char *q, *r, *p; + khash_t(s2i) *d; + d = kh_init(s2i); + for (p = text; *p; ++p) { + if (strncmp(p, "@SQ", 3) == 0) { + char *sn = 0; + int ln = -1; + for (q = p + 4;; ++q) { + if (strncmp(q, "SN:", 3) == 0) { + q += 3; + for (r = q; *r != '\t' && *r != '\n'; ++r); + sn = (char*)calloc(r - q + 1, 1); + strncpy(sn, q, r - q); + q = r; + } else if (strncmp(q, "LN:", 3) == 0) + ln = strtol(q + 3, (char**)&q, 10); + while (*q != '\t' && *q != '\n') ++q; + if (*q == '\n') break; + } + p = q; + if (sn && ln >= 0) { + 
khint_t k; + int absent; + k = kh_put(s2i, d, sn, &absent); + if (!absent) { + if (hts_verbose >= 2) + fprintf(stderr, "[W::%s] duplicated sequence '%s'\n", __func__, sn); + free(sn); + } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln; + } + } + while (*p != '\n') ++p; + } + return hdr_from_dict(d); +} + +bam_hdr_t *sam_hdr_read(htsFile *fp) +{ + if (fp->is_bin) { + return bam_hdr_read(fp->fp.bgzf); + } else if (fp->is_cram) { + return cram_header_to_bam(fp->fp.cram->header); + } else { + kstring_t str; + bam_hdr_t *h; + int has_SQ = 0; + str.l = str.m = 0; str.s = 0; + while (hts_getline(fp, KS_SEP_LINE, &fp->line) >= 0) { + if (fp->line.s[0] != '@') break; + if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1; + kputsn(fp->line.s, fp->line.l, &str); + kputc('\n', &str); + } + if (! has_SQ && fp->fn_aux) { + char line[2048]; + FILE *f = fopen(fp->fn_aux, "r"); + if (f == NULL) return NULL; + while (fgets(line, sizeof line, f)) { + const char *name = strtok(line, "\t"); + const char *length = strtok(NULL, "\t"); + ksprintf(&str, "@SQ\tSN:%s\tLN:%s\n", name, length); + } + fclose(f); + } + if (str.l == 0) kputsn("", 0, &str); + h = sam_hdr_parse(str.l, str.s); + h->l_text = str.l; h->text = str.s; + return h; + } +} + +int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) +{ + if (fp->is_bin) { + bam_hdr_write(fp->fp.bgzf, h); + } else if (fp->is_cram) { + cram_fd *fd = fp->fp.cram; + if (cram_set_header(fd, bam_header_to_cram((bam_hdr_t *)h)) < 0) return -1; + if (fp->fn_aux) + cram_load_reference(fd, fp->fn_aux); + if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1; + } else { + char *p; + hputs(h->text, fp->fp.hfile); + p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!! 
+ if (p == 0) { + int i; + for (i = 0; i < h->n_targets; ++i) { + fp->line.l = 0; + kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line); + kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line); + if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; + } + } + if ( hflush(fp->fp.hfile) != 0 ) return -1; + } + return 0; +} + +/********************** + *** SAM record I/O *** + **********************/ + +int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) +{ +#define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0 +#define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t' +#define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l) +#define _parse_err(cond, msg) do { if ((cond) && hts_verbose >= 1) { fprintf(stderr, "[E::%s] " msg "\n", __func__); goto err_ret; } } while (0) +#define _parse_warn(cond, msg) if ((cond) && hts_verbose >= 2) fprintf(stderr, "[W::%s] " msg "\n", __func__) + + uint8_t *t; + char *p = s->s, *q; + int i; + kstring_t str; + bam1_core_t *c = &b->core; + + str.l = b->l_data = 0; + str.s = (char*)b->data; str.m = b->m_data; + memset(c, 0, 32); + if (h->cigar_tab == 0) { + h->cigar_tab = (int8_t*) malloc(128); + for (i = 0; i < 128; ++i) + h->cigar_tab[i] = -1; + for (i = 0; BAM_CIGAR_STR[i]; ++i) + h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i; + } + // qname + q = _read_token(p); + kputsn_(q, p - q, &str); + c->l_qname = p - q; + // flag + c->flag = strtol(p, &p, 0); + if (*p++ != '\t') goto err_ret; // malformated flag + // chr + q = _read_token(p); + if (strcmp(q, "*")) { + _parse_err(h->n_targets == 0, "missing SAM header"); + c->tid = bam_name2id(h, q); + _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped"); + } else c->tid = -1; + // pos + 
c->pos = strtol(p, &p, 10) - 1; + if (*p++ != '\t') goto err_ret; + if (c->pos < 0 && c->tid >= 0) { + _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped"); + c->tid = -1; + } + if (c->tid < 0) c->flag |= BAM_FUNMAP; + // mapq + c->qual = strtol(p, &p, 10); + if (*p++ != '\t') goto err_ret; + // cigar + if (*p != '*') { + uint32_t *cigar; + size_t n_cigar = 0; + for (q = p; *p && *p != '\t'; ++p) + if (!isdigit(*p)) ++n_cigar; + if (*p++ != '\t') goto err_ret; + _parse_err(n_cigar >= 65536, "too many CIGAR operations"); + c->n_cigar = n_cigar; + _get_mem(uint32_t, &cigar, &str, c->n_cigar<<2); + for (i = 0; i < c->n_cigar; ++i, ++q) { + int op; + cigar[i] = strtol(q, &q, 10)<= 128? -1 : h->cigar_tab[(int)*q]; + _parse_err(op < 0, "unrecognized CIGAR operator"); + cigar[i] |= op; + } + i = bam_cigar2rlen(c->n_cigar, cigar); + } else { + _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped"); + c->flag |= BAM_FUNMAP; + q = _read_token(p); + i = 1; + } + c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5); + // mate chr + q = _read_token(p); + if (strcmp(q, "=") == 0) c->mtid = c->tid; + else if (strcmp(q, "*") == 0) c->mtid = -1; + else c->mtid = bam_name2id(h, q); + // mpos + c->mpos = strtol(p, &p, 10) - 1; + if (*p++ != '\t') goto err_ret; + if (c->mpos < 0 && c->mtid >= 0) { + _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped"); + c->mtid = -1; + } + // tlen + c->isize = strtol(p, &p, 10); + if (*p++ != '\t') goto err_ret; + // seq + q = _read_token(p); + if (strcmp(q, "*")) { + c->l_qseq = p - q - 1; + i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname)); + _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length"); + i = (c->l_qseq + 1) >> 1; + _get_mem(uint8_t, &t, &str, i); + memset(t, 0, i); + for (i = 0; i < c->l_qseq; ++i) + t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2); + } else c->l_qseq = 0; + // qual + q = 
_read_token_aux(p); + _get_mem(uint8_t, &t, &str, c->l_qseq); + if (strcmp(q, "*")) { + _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length"); + for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33; + } else memset(t, 0xff, c->l_qseq); + // aux + // Note that (like the bam1_core_t fields) this aux data in b->data is + // stored in host endianness; so there is no byte swapping needed here. + while (p < s->s + s->l) { + uint8_t type; + q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays + _parse_err(p - q - 1 < 6, "incomplete aux field"); + kputsn_(q, 2, &str); + q += 3; type = *q++; ++q; // q points to value + if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { + kputc_('A', &str); + kputc_(*q, &str); + } else if (type == 'i' || type == 'I') { + long x; + x = strtol(q, &q, 10); + if (x < 0) { + if (x >= INT8_MIN) { + kputc_('c', &str); kputc_(x, &str); + } else if (x >= INT16_MIN) { + int16_t y = x; + kputc_('s', &str); kputsn_((char*)&y, 2, &str); + } else { + int32_t y = x; + kputc_('i', &str); kputsn_(&y, 4, &str); + } + } else { + if (x <= UINT8_MAX) { + kputc_('C', &str); kputc_(x, &str); + } else if (x <= UINT16_MAX) { + uint16_t y = x; + kputc_('S', &str); kputsn_(&y, 2, &str); + } else { + uint32_t y = x; + kputc_('I', &str); kputsn_(&y, 4, &str); + } + } + } else if (type == 'f') { + float x; + x = strtod(q, &q); + kputc_('f', &str); kputsn_(&x, 4, &str); + } else if (type == 'd') { + double x; + x = strtod(q, &q); + kputc_('d', &str); kputsn_(&x, 8, &str); + } else if (type == 'Z' || type == 'H') { + kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL + } else if (type == 'B') { + int32_t n; + char *r; + _parse_err(p - q - 1 < 3, "incomplete B-typed aux field"); + type = *q++; // q points to the first ',' following the typing byte + for (r = q, n = 0; *r; ++r) + if (*r == ',') ++n; + kputc_('B', &str); kputc_(type, &str); kputsn_(&n, 4, &str); + // FIXME: to evaluate 
which is faster: a) aligned array and then memmove(); b) unaligned array; c) kputsn_() + if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); } + else if (type == 'C') while (q + 1 < p) { uint8_t x = (unsigned long)(strtol(q + 1, &q, 0)); kputc_(x, &str); } + else if (type == 's') while (q + 1 < p) { int16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); } + else if (type == 'S') while (q + 1 < p) { uint16_t x = (unsigned long)(strtol(q + 1, &q, 0)); kputsn_(&x, 2, &str); } + else if (type == 'i') while (q + 1 < p) { int32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); } + else if (type == 'I') while (q + 1 < p) { uint32_t x = (unsigned long)(strtol(q + 1, &q, 0)); kputsn_(&x, 4, &str); } + else if (type == 'f') while (q + 1 < p) { float x = strtod(q + 1, &q); kputsn_(&x, 4, &str); } + else _parse_err(1, "unrecognized type"); + } else _parse_err(1, "unrecognized type"); + } + b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; + return 0; + +#undef _parse_warn +#undef _parse_err +#undef _get_mem +#undef _read_token_aux +#undef _read_token +err_ret: + b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; + return -2; +} + +int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) +{ + if (fp->is_bin) { + int r = bam_read1(fp->fp.bgzf, b); + if (r >= 0) { + if (b->core.tid >= h->n_targets || b->core.tid < -1 || + b->core.mtid >= h->n_targets || b->core.mtid < -1) + return -3; + } + return r; + } else if (fp->is_cram) { + return cram_get_bam_seq(fp->fp.cram, &b); + } else { + int ret; +err_recover: + if (fp->line.l == 0) { + ret = hts_getline(fp, KS_SEP_LINE, &fp->line); + if (ret < 0) return -1; + } + ret = sam_parse1(&fp->line, h, b); + fp->line.l = 0; + if (ret < 0) { + if (hts_verbose >= 1) + fprintf(stderr, "[W::%s] parse error at line %lld\n", __func__, (long long)fp->lineno); + if (h->ignore_sam_err) goto err_recover; + } + return ret; + } +} + +int sam_format1(const bam_hdr_t *h, const bam1_t *b, 
kstring_t *str) +{ + int i; + uint8_t *s; + const bam1_core_t *c = &b->core; + + str->l = 0; + kputsn(bam_get_qname(b), c->l_qname-1, str); kputc('\t', str); // query name + kputw(c->flag, str); kputc('\t', str); // flag + if (c->tid >= 0) { // chr + kputs(h->target_name[c->tid] , str); + kputc('\t', str); + } else kputsn("*\t", 2, str); + kputw(c->pos + 1, str); kputc('\t', str); // pos + kputw(c->qual, str); kputc('\t', str); // qual + if (c->n_cigar) { // cigar + uint32_t *cigar = bam_get_cigar(b); + for (i = 0; i < c->n_cigar; ++i) { + kputw(bam_cigar_oplen(cigar[i]), str); + kputc(bam_cigar_opchr(cigar[i]), str); + } + } else kputc('*', str); + kputc('\t', str); + if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr + else if (c->mtid == c->tid) kputsn("=\t", 2, str); + else { + kputs(h->target_name[c->mtid], str); + kputc('\t', str); + } + kputw(c->mpos + 1, str); kputc('\t', str); // mate pos + kputw(c->isize, str); kputc('\t', str); // template len + if (c->l_qseq) { // seq and qual + uint8_t *s = bam_get_seq(b); + for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str); + kputc('\t', str); + s = bam_get_qual(b); + if (s[0] == 0xff) kputc('*', str); + else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str); + } else kputsn("*\t*", 3, str); + s = bam_get_aux(b); // aux + while (s+4 <= b->data + b->l_data) { + uint8_t type, key[2]; + key[0] = s[0]; key[1] = s[1]; + s += 2; type = *s++; + kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str); + if (type == 'A') { + kputsn("A:", 2, str); + kputc(*s, str); + ++s; + } else if (type == 'C') { + kputsn("i:", 2, str); + kputw(*s, str); + ++s; + } else if (type == 'c') { + kputsn("i:", 2, str); + kputw(*(int8_t*)s, str); + ++s; + } else if (type == 'S') { + if (s+2 <= b->data + b->l_data) { + kputsn("i:", 2, str); + kputw(*(uint16_t*)s, str); + s += 2; + } else return -1; + } else if (type == 's') { + if (s+2 <= b->data + b->l_data) { + kputsn("i:", 2, str); + kputw(*(int16_t*)s, 
str); + s += 2; + } else return -1; + } else if (type == 'I') { + if (s+4 <= b->data + b->l_data) { + kputsn("i:", 2, str); + kputuw(*(uint32_t*)s, str); + s += 4; + } else return -1; + } else if (type == 'i') { + if (s+4 <= b->data + b->l_data) { + kputsn("i:", 2, str); + kputw(*(int32_t*)s, str); + s += 4; + } else return -1; + } else if (type == 'f') { + if (s+4 <= b->data + b->l_data) { + ksprintf(str, "f:%g", *(float*)s); + s += 4; + } else return -1; + + } else if (type == 'd') { + if (s+8 <= b->data + b->l_data) { + ksprintf(str, "d:%g", *(double*)s); + s += 8; + } else return -1; + } else if (type == 'Z' || type == 'H') { + kputc(type, str); kputc(':', str); + while (s < b->data + b->l_data && *s) kputc(*s++, str); + if (s >= b->data + b->l_data) + return -1; + ++s; + } else if (type == 'B') { + uint8_t sub_type = *(s++); + int32_t n; + memcpy(&n, s, 4); + s += 4; // no point to the start of the array + if (s + n >= b->data + b->l_data) + return -1; + kputsn("B:", 2, str); kputc(sub_type, str); // write the typing + for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if" + kputc(',', str); + if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; } + else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; } + else if ('s' == sub_type) { kputw(*(int16_t*)s, str); s += 2; } + else if ('S' == sub_type) { kputw(*(uint16_t*)s, str); s += 2; } + else if ('i' == sub_type) { kputw(*(int32_t*)s, str); s += 4; } + else if ('I' == sub_type) { kputuw(*(uint32_t*)s, str); s += 4; } + else if ('f' == sub_type) { ksprintf(str, "%g", *(float*)s); s += 4; } + } + } + } + return str->l; +} + +int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) +{ + if (fp->is_bin) { + return bam_write1(fp->fp.bgzf, b); + } else if (fp->is_cram) { + return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b); + } else { + if (sam_format1(h, b, &fp->line) < 0) return -1; + kputc('\n', &fp->line); + if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) 
return -1; + return fp->line.l; + } +} + +/************************ + *** Auxiliary fields *** + ************************/ + +void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) +{ + int ori_len = b->l_data; + b->l_data += 3 + len; + if (b->m_data < b->l_data) { + b->m_data = b->l_data; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; + b->data[ori_len + 2] = type; + memcpy(b->data + ori_len + 3, data, len); +} + +static inline uint8_t *skip_aux(uint8_t *s) +{ + int size = aux_type2size(*s); ++s; // skip type + uint32_t n; + switch (size) { + case 'Z': + case 'H': + while (*s) ++s; + return s + 1; + case 'B': + size = aux_type2size(*s); ++s; + memcpy(&n, s, 4); s += 4; + return s + size * n; + case 0: + abort(); + break; + default: + return s + size; + } +} + +uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) +{ + uint8_t *s; + int y = tag[0]<<8 | tag[1]; + s = bam_get_aux(b); + while (s < b->data + b->l_data) { + int x = (int)s[0]<<8 | s[1]; + s += 2; + if (x == y) return s; + s = skip_aux(s); + } + return 0; +} +// s MUST BE returned by bam_aux_get() +int bam_aux_del(bam1_t *b, uint8_t *s) +{ + uint8_t *p, *aux; + int l_aux = bam_get_l_aux(b); + aux = bam_get_aux(b); + p = s - 2; + s = skip_aux(s); + memmove(p, s, l_aux - (s - aux)); + b->l_data -= s - p; + return 0; +} + +int32_t bam_aux2i(const uint8_t *s) +{ + int type; + type = *s++; + if (type == 'c') return (int32_t)*(int8_t*)s; + else if (type == 'C') return (int32_t)*(uint8_t*)s; + else if (type == 's') return (int32_t)*(int16_t*)s; + else if (type == 'S') return (int32_t)*(uint16_t*)s; + else if (type == 'i' || type == 'I') return *(int32_t*)s; + else return 0; +} + +double bam_aux2f(const uint8_t *s) +{ + int type; + type = *s++; + if (type == 'd') return *(double*)s; + else if (type == 'f') return *(float*)s; + else return 0.0; +} + +char bam_aux2A(const uint8_t *s) +{ + 
int type; + type = *s++; + if (type == 'A') return *(char*)s; + else return 0; +} + +char *bam_aux2Z(const uint8_t *s) +{ + int type; + type = *s++; + if (type == 'Z' || type == 'H') return (char*)s; + else return 0; +} + +int sam_open_mode(char *mode, const char *fn, const char *format) +{ + // TODO Parse "bam5" etc for compression level + if (format == NULL) { + // Try to pick a format based on the filename extension + const char *ext = fn? strrchr(fn, '.') : NULL; + if (ext == NULL || strchr(ext, '/')) return -1; + return sam_open_mode(mode, fn, ext+1); + } + else if (strcmp(format, "bam") == 0) strcpy(mode, "b"); + else if (strcmp(format, "cram") == 0) strcpy(mode, "c"); + else if (strcmp(format, "sam") == 0) strcpy(mode, ""); + else return -1; + + return 0; +} + +#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n)) +int bam_str2flag(const char *str) +{ + char *end, *beg = (char*) str; + long int flag = strtol(str, &end, 0); + if ( end!=str ) return flag; // the conversion was successful + flag = 0; + while ( *str ) + { + end = beg; + while ( *end && *end!=',' ) end++; + if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED; + else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR; + else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP; + else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP; + else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE; + else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE; + else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1; + else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2; + else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY; + else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL; + else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP; + else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY; + else return -1; + if ( !*end ) break; + beg = end + 1; + } + return flag; +} + 
+char *bam_flag2str(int flag) +{ + kstring_t str = {0,0,0}; + if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED"); + if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR"); + if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP"); + if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP"); + if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE"); + if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE"); + if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1"); + if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2"); + if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY"); + if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL"); + if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP"); + if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY"); + if ( str.l == 0 ) kputsn("", 0, &str); + return str.s; +} + + +/************************** + *** Pileup and Mpileup *** + **************************/ + +#if !defined(BAM_NO_PILEUP) + +#include + +/******************* + *** Memory pool *** + *******************/ + +typedef struct { + int k, x, y, end; +} cstate_t; + +static cstate_t g_cstate_null = { -1, 0, 0, 0 }; + +typedef struct __linkbuf_t { + bam1_t b; + int32_t beg, end; + cstate_t s; + struct __linkbuf_t *next; +} lbnode_t; + +typedef struct { + int cnt, n, max; + lbnode_t **buf; +} mempool_t; + +static mempool_t *mp_init(void) +{ + mempool_t *mp; + mp = (mempool_t*)calloc(1, sizeof(mempool_t)); + return mp; +} +static void mp_destroy(mempool_t *mp) +{ + int k; + for (k = 0; k < mp->n; ++k) { + free(mp->buf[k]->b.data); + free(mp->buf[k]); + } + free(mp->buf); + free(mp); +} +static inline lbnode_t *mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); + else return mp->buf[--mp->n]; +} +static inline void mp_free(mempool_t 
*mp, lbnode_t *p) +{ + --mp->cnt; p->next = 0; // clear lbnode_t::next here + if (mp->n == mp->max) { + mp->max = mp->max? mp->max<<1 : 256; + mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); + } + mp->buf[mp->n++] = p; +} + +/********************** + *** CIGAR resolver *** + **********************/ + +/* s->k: the index of the CIGAR operator that has just been processed. + s->x: the reference coordinate of the start of s->k + s->y: the query coordiante of the start of s->k + */ +static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s) +{ +#define _cop(c) ((c)&BAM_CIGAR_MASK) +#define _cln(c) ((c)>>BAM_CIGAR_SHIFT) + + bam1_t *b = p->b; + bam1_core_t *c = &b->core; + uint32_t *cigar = bam_get_cigar(b); + int k; + // determine the current CIGAR operation +// fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y); + if (s->k == -1) { // never processed + if (c->n_cigar == 1) { // just one operation, save a loop + if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; + } else { // find the first match or deletion + for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { + int op = _cop(cigar[k]); + int l = _cln(cigar[k]); + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break; + else if (op == BAM_CREF_SKIP) s->x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; + } + assert(k < c->n_cigar); + s->k = k; + } + } else { // the read has been processed before + int op, l = _cln(cigar[s->k]); + if (pos - s->x >= l) { // jump to the next operation + assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case + op = _cop(cigar[s->k+1]); + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop + if (_cop(cigar[s->k]) == BAM_CMATCH|| 
_cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; + s->x += l; + ++s->k; + } else { // find the next M/D/N/=/X + if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; + s->x += l; + for (k = s->k + 1; k < c->n_cigar; ++k) { + op = _cop(cigar[k]), l = _cln(cigar[k]); + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; + } + s->k = k; + } + assert(s->k < c->n_cigar); // otherwise a bug + } // else, do nothing + } + { // collect pileup information + int op, l; + op = _cop(cigar[s->k]); l = _cln(cigar[s->k]); + p->is_del = p->indel = p->is_refskip = 0; + if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation + int op2 = _cop(cigar[s->k+1]); + int l2 = _cln(cigar[s->k+1]); + if (op2 == BAM_CDEL) p->indel = -(int)l2; + else if (op2 == BAM_CINS) p->indel = l2; + else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding + int l3 = 0; + for (k = s->k + 2; k < c->n_cigar; ++k) { + op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); + if (op2 == BAM_CINS) l3 += l2; + else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; + } + if (l3 > 0) p->indel = l3; + } + } + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + p->qpos = s->y + (pos - s->x); + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! 
+ p->is_refskip = (op == BAM_CREF_SKIP); + } // cannot be other operations; otherwise a bug + p->is_head = (pos == c->pos); p->is_tail = (pos == s->end); + } + return 1; +} + +/*********************** + *** Pileup iterator *** + ***********************/ + +// Dictionary of overlapping reads +KHASH_MAP_INIT_STR(olap_hash, lbnode_t *) +typedef khash_t(olap_hash) olap_hash_t; + +struct __bam_plp_t { + mempool_t *mp; + lbnode_t *head, *tail, *dummy; + int32_t tid, pos, max_tid, max_pos; + int is_eof, max_plp, error, maxcnt; + uint64_t id; + bam_pileup1_t *plp; + // for the "auto" interface only + bam1_t *b; + bam_plp_auto_f func; + void *data; + olap_hash_t *overlaps; +}; + +bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) +{ + bam_plp_t iter; + iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t)); + iter->mp = mp_init(); + iter->head = iter->tail = mp_alloc(iter->mp); + iter->dummy = mp_alloc(iter->mp); + iter->max_tid = iter->max_pos = -1; + iter->maxcnt = 8000; + if (func) { + iter->func = func; + iter->data = data; + iter->b = bam_init1(); + } + return iter; +} + +void bam_plp_init_overlaps(bam_plp_t iter) +{ + iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads +} + +void bam_plp_destroy(bam_plp_t iter) +{ + if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps); + mp_free(iter->mp, iter->dummy); + mp_free(iter->mp, iter->head); + if (iter->mp->cnt != 0) + fprintf(stderr, "[bam_plp_destroy] memory leak: %d. 
Continue anyway.\n", iter->mp->cnt); + mp_destroy(iter->mp); + if (iter->b) bam_destroy1(iter->b); + free(iter->plp); + free(iter); +} + + +//--------------------------------- +//--- Tweak overlapping reads +//--------------------------------- + +/** + * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index + * cigar_iref2iseq_next() - get the next CMATCH base + * @cigar: pointer to current cigar block (rw) + * @cigar_max: pointer just beyond the last cigar block + * @icig: position within the current cigar block (rw) + * @iseq: position in the sequence (rw) + * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw) + * + * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered. + */ +static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) +{ + int pos = *iref; + if ( pos < 0 ) return -1; + *icig = 0; + *iseq = 0; + *iref = 0; + while ( *cigar> BAM_CIGAR_SHIFT; + + if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } + if ( cig==BAM_CHARD_CLIP ) { (*cigar)++; *icig = 0; continue; } + if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) + { + pos -= ncig; + if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; } + (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig; + continue; + } + if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } + if ( cig==BAM_CDEL ) + { + pos -= ncig; + if ( pos<0 ) pos = 0; + (*cigar)++; *icig = 0; *iref += ncig; + continue; + } + fprintf(stderr,"todo: cigar %d\n", cig); + assert(0); + } + *iseq = -1; + return -1; +} +static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) +{ + while ( *cigar < cigar_max ) + { + int cig = (**cigar) & BAM_CIGAR_MASK; + int ncig = (**cigar) >> BAM_CIGAR_SHIFT; + + if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || 
cig==BAM_CDIFF ) + { + if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; } + (*iseq)++; (*icig)++; (*iref)++; + return BAM_CMATCH; + } + if ( cig==BAM_CDEL ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; } + if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } + if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } + if ( cig==BAM_CHARD_CLIP ) { (*cigar)++; *icig = 0; continue; } + fprintf(stderr,"todo: cigar %d\n", cig); + assert(0); + } + *iseq = -1; + *iref = -1; + return -1; +} + +static void tweak_overlap_quality(bam1_t *a, bam1_t *b) +{ + uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; + uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; + int a_icig = 0, a_iseq = 0; + int b_icig = 0, b_iseq = 0; + uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); + uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); + + int iref = b->core.pos; + int a_iref = iref - a->core.pos; + int b_iref = iref - b->core.pos; + int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); + if ( a_ret<0 ) return; // no overlap + int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); + if ( b_ret<0 ) return; // no overlap + + #if DBG + fprintf(stderr,"tweak %s n_cigar=%d %d .. 
%d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, + a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b))); + #endif + + while ( 1 ) + { + // Increment reference position + while ( a_iref>=0 && a_iref < iref - a->core.pos ) + a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); + if ( a_ret<0 ) break; // done + if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos; + + while ( b_iref>=0 && b_iref < iref - b->core.pos ) + b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); + if ( b_ret<0 ) break; // done + if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos; + + iref++; + if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels + + if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) + { + #if DBG + fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]); + #endif + // we are very confident about this base + int qual = a_qual[a_iseq] + b_qual[b_iseq]; + a_qual[a_iseq] = qual>200 ? 200 : qual; + b_qual[b_iseq] = 0; + } + else + { + if ( a_qual[a_iseq] >= b_qual[b_iseq] ) + { + #if DBG + fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower(seq_nt16_str[bam_seqi(b_seq,b_iseq)])); + #endif + a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch + b_qual[b_iseq] = 0; + } + else + { + #if DBG + fprintf(stderr,"[%c/%c]",tolower(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]); + #endif + b_qual[b_iseq] = 0.8 * b_qual[b_iseq]; + a_qual[a_iseq] = 0; + } + } + } + #if DBG + fprintf(stderr,"\n"); + #endif +} + +// Fix overlapping reads. Simple soft-clipping did not give good results. +// Lowering qualities of unwanted bases is more selective and works better. 
+// +static void overlap_push(bam_plp_t iter, lbnode_t *node) +{ + if ( !iter->overlaps ) return; + + // mapped mates and paired reads only + if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return; + + // no overlap possible, unless some wild cigar + if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return; + + khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b)); + if ( kitr==kh_end(iter->overlaps) ) + { + int ret; + kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret); + kh_value(iter->overlaps, kitr) = node; + } + else + { + lbnode_t *a = kh_value(iter->overlaps, kitr); + tweak_overlap_quality(&a->b, &node->b); + kh_del(olap_hash, iter->overlaps, kitr); + assert(a->end-1 == a->s.end); + a->end = a->b.core.pos + bam_cigar2rlen(a->b.core.n_cigar, bam_get_cigar(&a->b)); + a->s.end = a->end - 1; + } +} + +static void overlap_remove(bam_plp_t iter, const bam1_t *b) +{ + if ( !iter->overlaps ) return; + + khiter_t kitr; + if ( b ) + { + kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b)); + if ( kitr!=kh_end(iter->overlaps) ) + kh_del(olap_hash, iter->overlaps, kitr); + } + else + { + // remove all + for (kitr = kh_begin(iter->overlaps); kitroverlaps); kitr++) + if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr); + } +} + + + +// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns +// pointer to the piled records if next position is ready or NULL if there is not enough records in the +// buffer yet (the current position is still the maximum position across all buffered reads). 
+const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + if (iter->error) { *_n_plp = -1; return 0; } + *_n_plp = 0; + if (iter->is_eof && iter->head->next == 0) return 0; + while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) { + int n_plp = 0; + lbnode_t *p, *q; + // write iter->plp at iter->pos + iter->dummy->next = iter->head; + for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) { + if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove + overlap_remove(iter, &p->b); + q->next = p->next; mp_free(iter->mp, p); p = q; + } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup + if (n_plp == iter->max_plp) { // then double the capacity + iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256; + iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp); + } + iter->plp[n_plp].b = &p->b; + if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true... + } + } + iter->head = iter->dummy->next; // dummy->next may be changed + *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos; + // update iter->tid and iter->pos + if (iter->head->next) { + if (iter->tid > iter->head->b.core.tid) { + fprintf(stderr, "[%s] unsorted input. 
Pileup aborts.\n", __func__); + iter->error = 1; + *_n_plp = -1; + return 0; + } + } + if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence + iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference + } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid + iter->pos = iter->head->beg; // jump to the next position + } else ++iter->pos; // scan contiguously + // return + if (n_plp) return iter->plp; + if (iter->is_eof && iter->head->next == 0) break; + } + return 0; +} + +int bam_plp_push(bam_plp_t iter, const bam1_t *b) +{ + if (iter->error) return -1; + if (b) { + if (b->core.tid < 0) { overlap_remove(iter, b); return 0; } + // Skip only unmapped reads here, any additional filtering must be done in iter->func + if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; } + if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) + { + overlap_remove(iter, b); + return 0; + } + bam_copy1(&iter->tail->b, b); + overlap_push(iter, iter->tail); +#ifndef BAM_NO_ID + iter->tail->b.id = iter->id++; +#endif + iter->tail->beg = b->core.pos; + iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); + iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t + if (b->core.tid < iter->max_tid) { + fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n"); + iter->error = 1; + return -1; + } + if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) { + fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n"); + iter->error = 1; + return -1; + } + iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg; + if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { + iter->tail->next = mp_alloc(iter->mp); + iter->tail = iter->tail->next; + } + } else iter->is_eof = 1; + return 0; +} + +const 
bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + const bam_pileup1_t *plp; + if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + else { // no pileup line can be obtained; read alignments + *_n_plp = 0; + if (iter->is_eof) return 0; + int ret; + while ( (ret=iter->func(iter->data, iter->b)) >= 0) { + if (bam_plp_push(iter, iter->b) < 0) { + *_n_plp = -1; + return 0; + } + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + // otherwise no pileup line can be returned; read the next alignment. + } + if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; } + bam_plp_push(iter, 0); + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + return 0; + } +} + +void bam_plp_reset(bam_plp_t iter) +{ + lbnode_t *p, *q; + iter->max_tid = iter->max_pos = -1; + iter->tid = iter->pos = 0; + iter->is_eof = 0; + for (p = iter->head; p->next;) { + overlap_remove(iter, NULL); + q = p->next; + mp_free(iter->mp, p); + p = q; + } + iter->head = iter->tail; +} + +void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) +{ + iter->maxcnt = maxcnt; +} + +/************************ + *** Mpileup iterator *** + ************************/ + +struct __bam_mplp_t { + int n; + uint64_t min, *pos; + bam_plp_t *iter; + int *n_plp; + const bam_pileup1_t **plp; +}; + +bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) +{ + int i; + bam_mplp_t iter; + iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t)); + iter->pos = (uint64_t*)calloc(n, 8); + iter->n_plp = (int*)calloc(n, sizeof(int)); + iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*)); + iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t)); + iter->n = n; + iter->min = (uint64_t)-1; + for (i = 0; i < n; ++i) { + iter->iter[i] = bam_plp_init(func, data[i]); + iter->pos[i] = iter->min; + } + return iter; +} + +void bam_mplp_init_overlaps(bam_mplp_t iter) +{ 
+ int i; + for (i = 0; i < iter->n; ++i) + bam_plp_init_overlaps(iter->iter[i]); +} + +void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) +{ + int i; + for (i = 0; i < iter->n; ++i) + iter->iter[i]->maxcnt = maxcnt; +} + +void bam_mplp_destroy(bam_mplp_t iter) +{ + int i; + for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); + free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp); + free(iter); +} + +int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) +{ + int i, ret = 0; + uint64_t new_min = (uint64_t)-1; + for (i = 0; i < iter->n; ++i) { + if (iter->pos[i] == iter->min) { + int tid, pos; + iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); + if ( iter->iter[i]->error ) return -1; + iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0; + } + if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i]; + } + iter->min = new_min; + if (new_min == (uint64_t)-1) return 0; + *_tid = new_min>>32; *_pos = (uint32_t)new_min; + for (i = 0; i < iter->n; ++i) { + if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line" + n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; + ++ret; + } else n_plp[i] = 0, plp[i] = 0; + } + return ret; +} + +#endif // ~!defined(BAM_NO_PILEUP) diff --git a/star-sys/STAR/source/htslib/synced_bcf_reader.c b/star-sys/STAR/source/htslib/synced_bcf_reader.c new file mode 100644 index 0000000..5fc9594 --- /dev/null +++ b/star-sys/STAR/source/htslib/synced_bcf_reader.c @@ -0,0 +1,1183 @@ +#include +#include +#include +#include +#include +#include +#include +#include "htslib/synced_bcf_reader.h" +#include "htslib/kseq.h" +#include "htslib/khash_str2int.h" + +#define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi + +typedef struct +{ + uint32_t start, end; +} +region1_t; + +typedef struct _region_t +{ + region1_t *regs; + int nregs, mregs, creg; +} +region_t; + +static void 
_regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end); +static bcf_sr_regions_t *_regions_init_string(const char *str); +static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); + +static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters) +{ + kstring_t str = {0,0,0}; + const char *tmp = filters, *prev = filters; + int nout = 0, *out = NULL; + while ( 1 ) + { + if ( *tmp==',' || !*tmp ) + { + out = (int*) realloc(out, sizeof(int)); + if ( tmp-prev==1 && *prev=='.' ) + out[nout] = -1; + else + { + str.l = 0; + kputsn(prev, tmp-prev, &str); + out[nout] = bcf_hdr_id2int(hdr, BCF_DT_ID, str.s); + } + nout++; + if ( !*tmp ) break; + prev = tmp+1; + } + tmp++; + } + if ( str.m ) free(str.s); + *nfilters = nout; + return out; +} + +int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file) +{ + assert( !readers->regions ); + if ( readers->nreaders ) + { + fprintf(stderr,"[%s:%d %s] Error: bcf_sr_set_regions() must be called before bcf_sr_add_reader()\n", __FILE__,__LINE__,__FUNCTION__); + return -1; + } + readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); + if ( !readers->regions ) return -1; + readers->explicit_regs = 1; + readers->require_index = 1; + return 0; +} +int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles) +{ + assert( !readers->targets ); + readers->targets = bcf_sr_regions_init(targets,is_file,0,1,-2); + if ( !readers->targets ) return -1; + readers->targets_als = alleles; + return 0; +} + +int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) +{ + files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1)); + files->has_line[files->nreaders] = 0; + files->readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1)); + bcf_sr_t *reader = &files->readers[files->nreaders++]; + memset(reader,0,sizeof(bcf_sr_t)); + + reader->file = hts_open(fname, "r"); + if ( !reader->file ) 
return 0; + + reader->type = reader->file->is_bin? FT_BCF : FT_VCF; + if (reader->file->is_compressed) reader->type |= FT_GZ; + + if ( files->require_index ) + { + if ( reader->type==FT_VCF_GZ ) + { + reader->tbx_idx = tbx_index_load(fname); + if ( !reader->tbx_idx ) + { + fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname); + return 0; + } + + reader->header = bcf_hdr_read(reader->file); + } + else if ( reader->type==FT_BCF_GZ ) + { + reader->header = bcf_hdr_read(reader->file); + + reader->bcf_idx = bcf_index_load(fname); + if ( !reader->bcf_idx ) + { + fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname); + return 0; // not indexed..? + } + } + else + { + fprintf(stderr,"Index required, expected .vcf.gz or .bcf file: %s\n", fname); + return 0; + } + } + else + { + if ( reader->type & FT_BCF ) + { + reader->header = bcf_hdr_read(reader->file); + } + else if ( reader->type & FT_VCF ) + { + reader->header = bcf_hdr_read(reader->file); + } + else + { + fprintf(stderr,"File type not recognised: %s\n", fname); + return 0; + } + files->streaming = 1; + } + if ( files->streaming && files->nreaders>1 ) + { + fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders); + return 0; + } + if ( files->streaming && files->regions ) + { + fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__); + return 0; + } + if ( !reader->header ) return 0; + + reader->fname = fname; + if ( files->apply_filters ) + reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids); + + // Update list of chromosomes + if ( !files->explicit_regs && !files->streaming ) + { + int n,i; + const char **names = reader->tbx_idx ? 
tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n); + for (i=0; iregions ) + files->regions = _regions_init_string(names[i]); + else + _regions_add(files->regions, names[i], -1, -1); + } + free(names); + } + + return 1; +} + +bcf_srs_t *bcf_sr_init(void) +{ + bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t)); + return files; +} + +static void bcf_sr_destroy1(bcf_sr_t *reader) +{ + if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx); + if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx); + bcf_hdr_destroy(reader->header); + hts_close(reader->file); + if ( reader->itr ) tbx_itr_destroy(reader->itr); + int j; + for (j=0; jmbuffer; j++) + bcf_destroy1(reader->buffer[j]); + free(reader->buffer); + free(reader->samples); + free(reader->filter_ids); +} +void bcf_sr_destroy(bcf_srs_t *files) +{ + int i; + for (i=0; inreaders; i++) + bcf_sr_destroy1(&files->readers[i]); + free(files->has_line); + free(files->readers); + for (i=0; in_smpl; i++) free(files->samples[i]); + free(files->samples); + if (files->targets) bcf_sr_regions_destroy(files->targets); + if (files->regions) bcf_sr_regions_destroy(files->regions); + if ( files->tmps.m ) free(files->tmps.s); + free(files); +} + +void bcf_sr_remove_reader(bcf_srs_t *files, int i) +{ + assert( !files->samples ); // not ready for this yet + bcf_sr_destroy1(&files->readers[i]); + if ( i+1 < files->nreaders ) + { + memmove(&files->readers[i], &files->readers[i+1], (files->nreaders-i-1)*sizeof(bcf_sr_t)); + memmove(&files->has_line[i], &files->has_line[i+1], (files->nreaders-i-1)*sizeof(int)); + } + files->nreaders--; +} + + +/* + Removes duplicate records from the buffer. The meaning of "duplicate" is + controlled by the $collapse variable, which can cause that from multiple + lines only the first is considered and the rest is ignored. + The removal is done by setting the redundant lines' positions to -1 and + moving these lines at the end of the buffer. 
+ */ +static void collapse_buffer(bcf_srs_t *files, bcf_sr_t *reader) +{ + int irec,jrec, has_snp=0, has_indel=0, has_any=0; + for (irec=1; irec<=reader->nbuffer; irec++) + { + bcf1_t *line = reader->buffer[irec]; + if ( line->pos != reader->buffer[1]->pos ) break; + if ( files->collapse&COLLAPSE_ANY ) + { + if ( !has_any ) has_any = 1; + else line->pos = -1; + } + int line_type = bcf_get_variant_types(line); + if ( files->collapse&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) ) + { + if ( !has_snp ) has_snp = 1; + else line->pos = -1; + } + if ( files->collapse&COLLAPSE_INDELS && line_type&VCF_INDEL ) + { + if ( !has_indel ) has_indel = 1; + else line->pos = -1; + } + } + bcf1_t *tmp; + irec = jrec = 1; + while ( irec<=reader->nbuffer && jrec<=reader->nbuffer ) + { + if ( reader->buffer[irec]->pos != -1 ) { irec++; continue; } + if ( jrec<=irec ) jrec = irec+1; + while ( jrec<=reader->nbuffer && reader->buffer[jrec]->pos==-1 ) jrec++; + if ( jrec<=reader->nbuffer ) + { + tmp = reader->buffer[irec]; reader->buffer[irec] = reader->buffer[jrec]; reader->buffer[jrec] = tmp; + } + } + reader->nbuffer = irec - 1; +} + +void debug_buffer(FILE *fp, bcf_sr_t *reader) +{ + int j; + for (j=0; j<=reader->nbuffer; j++) + { + bcf1_t *line = reader->buffer[j]; + fprintf(fp,"%s%s\t%s:%d\t%s ", reader->fname,j==0?"*":"",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); + int k; + for (k=1; kn_allele; k++) fprintf(fp," %s", line->d.allele[k]); + fprintf(fp,"\n"); + } +} + +void debug_buffers(FILE *fp, bcf_srs_t *files) +{ + int i; + for (i=0; inreaders; i++) + { + fprintf(fp, "has_line: %d\t%s\n", bcf_sr_has_line(files,i),files->readers[i].fname); + debug_buffer(fp, &files->readers[i]); + } + fprintf(fp,"\n"); +} + +static inline int has_filter(bcf_sr_t *reader, bcf1_t *line) +{ + int i, j; + if ( !line->d.n_flt ) + { + for (j=0; jnfilter_ids; j++) + if ( reader->filter_ids[j]<0 ) return 1; + return 0; + } + for (i=0; id.n_flt; i++) + { 
+ for (j=0; jnfilter_ids; j++) + if ( line->d.flt[i]==reader->filter_ids[j] ) return 1; + } + return 0; +} + +static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end) +{ + if ( end>=MAX_CSI_COOR ) + { + fprintf(stderr,"The coordinate is out of csi index limit: %d\n", end+1); + exit(1); + } + if ( reader->itr ) + { + hts_itr_destroy(reader->itr); + reader->itr = NULL; + } + reader->nbuffer = 0; + if ( reader->tbx_idx ) + { + int tid = tbx_name2id(reader->tbx_idx, seq); + if ( tid==-1 ) return -1; // the sequence not present in this file + reader->itr = tbx_itr_queryi(reader->tbx_idx,tid,start,end+1); + } + else + { + int tid = bcf_hdr_name2id(reader->header, seq); + if ( tid==-1 ) return -1; // the sequence not present in this file + reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,start,end+1); + } + assert(reader->itr); + return 0; +} + +/* + * _readers_next_region() - jumps to next region if necessary + * Returns 0 on success or -1 when there are no more regions left + */ +static int _readers_next_region(bcf_srs_t *files) +{ + // Need to open new chromosome? 
Check number of lines in all readers' buffers + int i, eos = 0; + for (i=0; inreaders; i++) + if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++; + + if ( eos!=files->nreaders ) + { + // Some of the readers still has buffered lines + return 0; + } + + // No lines in the buffer, need to open new region or quit + if ( bcf_sr_regions_next(files->regions)<0 ) return -1; + + for (i=0; inreaders; i++) + _reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end); + + return 0; +} + +/* + * _reader_fill_buffer() - buffers all records with the same coordinate + */ +static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) +{ + // Return if the buffer is full: the coordinate of the last buffered record differs + if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return; + + // No iterator (sequence not present in this file) and not streaming + if ( !reader->itr && !files->streaming ) return; + + // Fill the buffer with records starting at the same position + int i, ret = 0; + while (1) + { + if ( reader->nbuffer+1 >= reader->mbuffer ) + { + // Increase buffer size + reader->mbuffer += 8; + reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer); + for (i=8; i>0; i--) // initialize + { + reader->buffer[reader->mbuffer-i] = bcf_init1(); + reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack; + reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1 + } + } + if ( files->streaming ) + { + if ( reader->type & FT_VCF ) + { + if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines + int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); + if ( ret<0 ) break; + } + else if ( reader->type & FT_BCF ) + { + if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines 
+ } + else + { + fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__); + exit(1); + } + } + else if ( reader->tbx_idx ) + { + if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break; // no more lines + vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); + } + else + { + if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines + bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); + } + + // apply filter + if ( !reader->nfilter_ids ) + bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR); + else + { + bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT); + if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue; + } + reader->nbuffer++; + + if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full + } + if ( ret<0 ) + { + // done for this region + tbx_itr_destroy(reader->itr); + reader->itr = NULL; + } + if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos ) + collapse_buffer(files, reader); +} + +/* + * _readers_shift_buffer() - removes the first line and all subsequent lines with the same position + */ +static void _reader_shift_buffer(bcf_sr_t *reader) +{ + int i; + for (i=2; i<=reader->nbuffer; i++) + if ( reader->buffer[i]->pos!=reader->buffer[1]->pos ) break; + if ( i<=reader->nbuffer ) + { + // A record with a different position follows, swap it. Because of the reader's logic, + // only one such line can be present. + bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[i]; reader->buffer[i] = tmp; + reader->nbuffer = 1; + } + else + reader->nbuffer = 0; // no other line +} + +/* + * _reader_match_alleles() - from multiple buffered lines selects the one which + * corresponds best to the template line. 
The logic is controlled by COLLAPSE_* + * Returns 0 on success or -1 when no good matching line is found. + */ +static int _reader_match_alleles(bcf_srs_t *files, bcf_sr_t *reader, bcf1_t *tmpl) +{ + int i, irec = -1; + + // if no template given, use the first available record + if ( !tmpl ) + irec = 1; + else + { + int tmpl_type = bcf_get_variant_types(tmpl); + for (i=1; i<=reader->nbuffer; i++) + { + bcf1_t *line = reader->buffer[i]; + if ( line->pos != reader->buffer[1]->pos ) break; // done with this reader + + // Easiest case: matching by position only + if ( files->collapse&COLLAPSE_ANY ) { irec=i; break; } + + int line_type = bcf_get_variant_types(line); + + // No matter what the alleles are, as long as they are both SNPs + if ( files->collapse&COLLAPSE_SNPS && tmpl_type&VCF_SNP && line_type&VCF_SNP ) { irec=i; break; } + // ... or indels + if ( files->collapse&COLLAPSE_INDELS && tmpl_type&VCF_INDEL && line_type&VCF_INDEL ) { irec=i; break; } + + // More thorough checking: REFs must match + if ( tmpl->rlen != line->rlen ) continue; // different length + if ( strcmp(tmpl->d.allele[0], line->d.allele[0]) ) continue; // the strings do not match + + int ial,jal; + if ( files->collapse==COLLAPSE_NONE ) + { + // Exact match, all alleles must be identical + if ( tmpl->n_allele!=line->n_allele ) continue; // different number of alleles, skip + + int nmatch = 1; // REF has been already checked + for (ial=1; ialn_allele; ial++) + { + for (jal=1; jaln_allele; jal++) + if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; } + } + if ( nmatch==tmpl->n_allele ) { irec=i; break; } // found: exact match + continue; + } + + // COLLAPSE_SOME: at least some ALTs must match + for (ial=1; ialn_allele; ial++) + { + for (jal=1; jaln_allele; jal++) + if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { irec=i; break; } + if ( irec>=1 ) break; + } + if ( irec>=1 ) break; + } + if ( irec==-1 ) return -1; // no matching line was found + } + + // Set the 
selected line (irec) as active: set it to buffer[0], move the remaining lines forward + // and put the old bcf1_t record at the end. + bcf1_t *tmp = reader->buffer[0]; + reader->buffer[0] = reader->buffer[irec]; + for (i=irec+1; i<=reader->nbuffer; i++) reader->buffer[i-1] = reader->buffer[i]; + reader->buffer[ reader->nbuffer ] = tmp; + reader->nbuffer--; + + return 0; +} + +int _reader_next_line(bcf_srs_t *files) +{ + int i, min_pos = INT_MAX; + + // Loop until next suitable line is found or all readers have finished + while ( 1 ) + { + // Get all readers ready for the next region. + if ( files->regions && _readers_next_region(files)<0 ) break; + + // Fill buffers + const char *chr = NULL; + for (i=0; inreaders; i++) + { + _reader_fill_buffer(files, &files->readers[i]); + + // Update the minimum coordinate + if ( !files->readers[i].nbuffer ) continue; + if ( min_pos > files->readers[i].buffer[1]->pos ) + { + min_pos = files->readers[i].buffer[1]->pos; + chr = bcf_seqname(files->readers[i].header, files->readers[i].buffer[1]); + } + } + if ( min_pos==INT_MAX ) + { + if ( !files->regions ) break; + continue; + } + + // Skip this position if not present in targets + if ( files->targets ) + { + if ( bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos)<0 ) + { + // Remove all lines with this position from the buffer + for (i=0; inreaders; i++) + if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos ) + _reader_shift_buffer(&files->readers[i]); + min_pos = INT_MAX; + continue; + } + } + + break; // done: min_pos is set + } + + // There can be records with duplicate positions. Set the active line intelligently so that + // the alleles match. 
+ int nret = 0; // number of readers sharing the position + bcf1_t *first = NULL; // record which will be used for allele matching + for (i=0; inreaders; i++) + { + files->has_line[i] = 0; + + // Skip readers with no records at this position + if ( !files->readers[i].nbuffer || files->readers[i].buffer[1]->pos!=min_pos ) continue; + + // Until now buffer[0] of all reader was empty and the lines started at buffer[1]. + // Now lines which are ready to be output will be moved to buffer[0]. + if ( _reader_match_alleles(files, &files->readers[i], first) < 0 ) continue; + if ( !first ) first = files->readers[i].buffer[0]; + + nret++; + files->has_line[i] = 1; + } + return nret; +} + +int bcf_sr_next_line(bcf_srs_t *files) +{ + if ( !files->targets_als ) + return _reader_next_line(files); + + while (1) + { + int i, ret = _reader_next_line(files); + if ( !ret ) return ret; + + for (i=0; inreaders; i++) + if ( files->has_line[i] ) break; + + if ( _regions_match_alleles(files->targets, files->targets_als-1, files->readers[i].buffer[0]) ) return ret; + + // Check if there are more duplicate lines in the buffers. 
If not, return this line as if it + // matched the targets, even if there is a type mismatch + for (i=0; inreaders; i++) + { + if ( !files->has_line[i] ) continue; + if ( files->readers[i].nbuffer==0 || files->readers[i].buffer[1]->pos!=files->readers[i].buffer[0]->pos ) continue; + break; + } + if ( i==files->nreaders ) return ret; // no more lines left, output even if target alleles are not of the same type + } +} + +static void bcf_sr_seek_start(bcf_srs_t *readers) +{ + bcf_sr_regions_t *reg = readers->regions; + int i; + for (i=0; inseqs; i++) + reg->regs[i].creg = -1; + reg->iseq = 0; +} + + +int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos) +{ + if ( !seq && !pos ) + { + // seek to start + bcf_sr_seek_start(readers); + return 0; + } + + bcf_sr_regions_overlap(readers->regions, seq, pos, pos); + int i, nret = 0; + for (i=0; inreaders; i++) + { + nret += _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1); + } + return nret; +} + +int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) +{ + int i, j, nsmpl, free_smpl = 0; + char **smpl = NULL; + + void *exclude = (fname[0]=='^') ? 
khash_str2int_init() : NULL; + if ( exclude || strcmp("-",fname) ) // "-" stands for all samples + { + smpl = hts_readlist(fname, is_file, &nsmpl); + if ( !smpl ) + { + fprintf(stderr,"Could not read the file: \"%s\"\n", fname); + return 0; + } + if ( exclude ) + { + for (i=0; ireaders[0].header->samples; // intersection of all samples + nsmpl = bcf_hdr_nsamples(files->readers[0].header); + } + + files->samples = NULL; + files->n_smpl = 0; + for (i=0; inreaders; j++) + { + if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break; + n_isec++; + } + if ( n_isec!=files->nreaders ) + { + fprintf(stderr,"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->readers[n_isec].fname); + continue; + } + + files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*)); + files->samples[files->n_smpl++] = strdup(smpl[i]); + } + + if ( exclude ) khash_str2int_destroy(exclude); + if ( free_smpl ) + { + for (i=0; in_smpl ) + { + if ( files->nreaders>1 ) + fprintf(stderr,"No samples in common.\n"); + return 0; + } + for (i=0; inreaders; i++) + { + bcf_sr_t *reader = &files->readers[i]; + reader->samples = (int*) malloc(sizeof(int)*files->n_smpl); + reader->n_smpl = files->n_smpl; + for (j=0; jn_smpl; j++) + reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]); + } + return 1; +} + +// Add a new region into a list sorted by start,end. On input the coordinates +// are 1-based, stored 0-based, inclusive. 
+static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end) +{ + if ( start==-1 && end==-1 ) + { + start = 0; end = MAX_CSI_COOR-1; + } + else + { + start--; end--; // store 0-based coordinates + } + + if ( !reg->seq_hash ) + reg->seq_hash = khash_str2int_init(); + + int iseq; + if ( khash_str2int_get(reg->seq_hash, chr, &iseq)<0 ) + { + // the chromosome block does not exist + iseq = reg->nseqs++; + reg->seq_names = (char**) realloc(reg->seq_names,sizeof(char*)*reg->nseqs); + reg->regs = (region_t*) realloc(reg->regs,sizeof(region_t)*reg->nseqs); + memset(®->regs[reg->nseqs-1],0,sizeof(region_t)); + reg->seq_names[iseq] = strdup(chr); + reg->regs[iseq].creg = -1; + khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq); + } + + region_t *creg = ®->regs[iseq]; + + // the regions may not be sorted on input: binary search + int i, min = 0, max = creg->nregs - 1; + while ( min<=max ) + { + i = (max+min)/2; + if ( start < creg->regs[i].start ) max = i - 1; + else if ( start > creg->regs[i].start ) min = i + 1; + else break; + } + if ( min>max || creg->regs[i].start!=start || creg->regs[i].end!=end ) + { + // no such region, insert a new one just after max + hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs); + if ( ++max < creg->nregs ) + memmove(&creg->regs[max+1],&creg->regs[max],(creg->nregs - max)*sizeof(region1_t)); + creg->regs[max].start = start; + creg->regs[max].end = end; + creg->nregs++; + } +} + +// File name or a list of genomic locations. If file name, NULL is returned. 
+static bcf_sr_regions_t *_regions_init_string(const char *str) +{ + bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); + reg->start = reg->end = -1; + reg->prev_start = reg->prev_seq = -1; + + kstring_t tmp = {0,0,0}; + const char *sp = str, *ep = str; + int from, to; + while ( 1 ) + { + while ( *ep && *ep!=',' && *ep!=':' ) ep++; + tmp.l = 0; + kputsn(sp,ep-sp,&tmp); + if ( *ep==':' ) + { + sp = ep+1; + from = strtol(sp,(char**)&ep,10); + if ( sp==ep ) + { + fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); + free(reg); free(tmp.s); return NULL; + } + if ( !*ep || *ep==',' ) + { + _regions_add(reg, tmp.s, from, from); + sp = ep; + continue; + } + if ( *ep!='-' ) + { + fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); + free(reg); free(tmp.s); return NULL; + } + ep++; + sp = ep; + to = strtol(sp,(char**)&ep,10); + if ( *ep && *ep!=',' ) + { + fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); + free(reg); free(tmp.s); return NULL; + } + if ( sp==ep ) to = MAX_CSI_COOR-1; + _regions_add(reg, tmp.s, from, to); + if ( !*ep ) break; + sp = ep; + } + else + { + if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1); + if ( !*ep ) break; + sp = ++ep; + } + } + free(tmp.s); + return reg; +} + +// ichr,ifrom,ito are 0-based; +// returns -1 on error, 0 if the line is a comment line, 1 on success +static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,char **chr_end,int *from,int *to) +{ + *chr_end = NULL; + + if ( line[0]=='#' ) return 0; + + int k,l; // index of the start and end column of the tab-delimited file + if ( ifrom <= ito ) + k = ifrom, l = ito; + else + l = ifrom, k = ito; + + int i; + char *se = line, *ss = NULL; // start and end + char *tmp; + for (i=0; i<=k && *se; i++) + { + ss = i==0 ? 
se++ : ++se; + while (*se && *se!='\t') se++; + } + if ( i<=k ) return -1; + if ( k==l ) + { + *from = *to = strtol(ss, &tmp, 10); + if ( tmp==ss ) return -1; + } + else + { + if ( k==ifrom ) + *from = strtol(ss, &tmp, 10); + else + *to = strtol(ss, &tmp, 10); + if ( ss==tmp ) return -1; + + for (i=k; i0 ) ss = ++se; + while (*se && *se!='\t') se++; + } + if ( i<=ichr ) return -1; + *chr_end = se; + *chr = ss; + return 1; +} + +bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito) +{ + bcf_sr_regions_t *reg; + if ( !is_file ) return _regions_init_string(regions); + + reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); + reg->start = reg->end = -1; + reg->prev_start = reg->prev_seq = -1; + + reg->file = hts_open(regions, "rb"); + if ( !reg->file ) + { + fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions); + free(reg); + return NULL; + } + + reg->tbx = tbx_index_load(regions); + if ( !reg->tbx ) + { + int len = strlen(regions); + int is_bed = strcasecmp(".bed",regions+len-4) ? 
0 : 1; + if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1; + int ft_type = hts_file_type(regions); + if ( ft_type & FT_VCF ) ito = 1; + + // read the whole file, tabix index is not present + while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 ) + { + char *chr, *chr_end; + int from, to, ret; + ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to); + if ( ret < 0 ) + { + if ( ito<0 ) + ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to); + if ( ret<0 ) + { + fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1); + hts_close(reg->file); reg->file = NULL; free(reg); + return NULL; + } + } + if ( !ret ) continue; + if ( is_bed ) from++; + *chr_end = 0; + _regions_add(reg, chr, from, to); + *chr_end = '\t'; + } + hts_close(reg->file); reg->file = NULL; + if ( !reg->nseqs ) { free(reg); return NULL; } + return reg; + } + + reg->seq_names = (char**) tbx_seqnames(reg->tbx, ®->nseqs); + if ( !reg->seq_hash ) + reg->seq_hash = khash_str2int_init(); + int i; + for (i=0; inseqs; i++) + { + khash_str2int_set(reg->seq_hash,reg->seq_names[i],i); + } + reg->fname = strdup(regions); + reg->is_bin = 1; + return reg; +} + +void bcf_sr_regions_destroy(bcf_sr_regions_t *reg) +{ + int i; + free(reg->fname); + if ( reg->itr ) tbx_itr_destroy(reg->itr); + if ( reg->tbx ) tbx_destroy(reg->tbx); + if ( reg->file ) hts_close(reg->file); + if ( reg->als ) free(reg->als); + if ( reg->als_str.s ) free(reg->als_str.s); + free(reg->line.s); + if ( reg->regs ) + { + // free only in-memory names, tbx names are const + for (i=0; inseqs; i++) + { + free(reg->seq_names[i]); + free(reg->regs[i].regs); + } + } + free(reg->regs); + free(reg->seq_names); + khash_str2int_destroy(reg->seq_hash); + free(reg); +} + +int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq) +{ + reg->iseq = reg->start = reg->end = -1; + if ( 
khash_str2int_get(reg->seq_hash, seq, ®->iseq) < 0 ) return -1; // sequence seq not in regions + + // using in-memory regions + if ( reg->regs ) return 0; + + // reading regions from tabix + if ( reg->itr ) tbx_itr_destroy(reg->itr); + reg->itr = tbx_itr_querys(reg->tbx, seq); + if ( reg->itr ) return 0; + + return -1; +} + +int bcf_sr_regions_next(bcf_sr_regions_t *reg) +{ + if ( reg->iseq<0 ) return -1; + reg->start = reg->end = -1; + reg->nals = 0; + + // using in-memory regions + if ( reg->regs ) + { + while ( reg->iseq < reg->nseqs ) + { + reg->regs[reg->iseq].creg++; + if ( reg->regs[reg->iseq].creg < reg->regs[reg->iseq].nregs ) break; + reg->iseq++; + } + if ( reg->iseq >= reg->nseqs ) { reg->iseq = -1; return -1; } // no more regions left + region1_t *creg = ®->regs[reg->iseq].regs[reg->regs[reg->iseq].creg]; + reg->start = creg->start; + reg->end = creg->end; + return 0; + } + + // reading from tabix + char *chr, *chr_end; + int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to; + if ( reg->tbx ) + { + ichr = reg->tbx->conf.sc-1; + ifrom = reg->tbx->conf.bc-1; + ito = reg->tbx->conf.ec-1; + if ( ito<0 ) ito = ifrom; + is_bed = reg->tbx->conf.preset==TBX_UCSC ? 1 : 0; + } + + int ret = 0; + while ( !ret ) + { + if ( reg->itr ) + { + // tabix index present, reading a chromosome block + ret = tbx_itr_next(reg->file, reg->tbx, reg->itr, ®->line); + if ( ret<0 ) { reg->iseq = -1; return -1; } + } + else + { + if ( reg->is_bin ) + { + // Waited for seek which never came. 
Reopen in text mode and stream + // through the regions, otherwise hts_getline would fail + hts_close(reg->file); + reg->file = hts_open(reg->fname, "r"); + if ( !reg->file ) + { + fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,reg->fname); + reg->file = NULL; + bcf_sr_regions_destroy(reg); + return -1; + } + reg->is_bin = 0; + } + + // tabix index absent, reading the whole file + ret = hts_getline(reg->file, KS_SEP_LINE, ®->line); + if ( ret<0 ) { reg->iseq = -1; return -1; } + } + ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&chr_end,&from,&to); + if ( ret<0 ) + { + fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d,%d\n", __FILE__,__LINE__,reg->fname,ichr+1,ifrom+1,ito+1); + return -1; + } + } + if ( is_bed ) from++; + + *chr_end = 0; + if ( khash_str2int_get(reg->seq_hash, chr, ®->iseq)<0 ) + { + fprintf(stderr,"Broken tabix index? The sequence \"%s\" not in dictionary [%s]\n", chr,reg->line.s); + exit(1); + } + *chr_end = '\t'; + + reg->start = from - 1; + reg->end = to - 1; + return 0; +} + +static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec) +{ + int i = 0, max_len = 0; + if ( !reg->nals ) + { + char *ss = reg->line.s; + while ( inals = 1; + while ( *se && *se!='\t' ) + { + if ( *se==',' ) reg->nals++; + se++; + } + ks_resize(®->als_str, se-ss+1+reg->nals); + reg->als_str.l = 0; + hts_expand(char*,reg->nals,reg->mals,reg->als); + reg->nals = 0; + + se = ss; + while ( *(++se) ) + { + if ( *se=='\t' ) break; + if ( *se!=',' ) continue; + reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; + kputsn(ss,se-ss,®->als_str); + if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; + reg->als_str.l++; + reg->nals++; + ss = ++se; + } + reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; + kputsn(ss,se-ss,®->als_str); + if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) 
max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; + reg->nals++; + reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP; // this is a simplified check, see vcf.c:bcf_set_variant_types + } + int type = bcf_get_variant_types(rec); + if ( reg->als_type & VCF_INDEL ) + return type & VCF_INDEL ? 1 : 0; + return !(type & VCF_INDEL) ? 1 : 0; +} + +int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end) +{ + int iseq; + if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence + + if ( reg->prev_seq==-1 || iseq!=reg->prev_seq || reg->prev_start > start ) // new chromosome or after a seek + { + // flush regions left on previous chromosome + if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 ) + bcf_sr_regions_flush(reg); + + bcf_sr_regions_seek(reg, seq); + reg->start = reg->end = -1; + } + if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome + reg->prev_seq = reg->iseq; + reg->prev_start = start; + + while ( iseq==reg->iseq && reg->end < start ) + { + if ( bcf_sr_regions_next(reg) < 0 ) return -2; // no more regions left + if ( reg->iseq != iseq ) return -1; // does not overlap any regions + if ( reg->missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data); + } + if ( reg->start <= end ) return 0; // region overlap + return -1; // no overlap +} + +void bcf_sr_regions_flush(bcf_sr_regions_t *reg) +{ + if ( !reg->missed_reg_handler || reg->prev_seq==-1 ) return; + while ( !bcf_sr_regions_next(reg) ) reg->missed_reg_handler(reg, reg->missed_reg_data); + return; +} + diff --git a/star-sys/STAR/source/htslib/tabix.1 b/star-sys/STAR/source/htslib/tabix.1 new file mode 100644 index 0000000..1bd9533 --- /dev/null +++ b/star-sys/STAR/source/htslib/tabix.1 @@ -0,0 +1,132 @@ +.TH tabix 1 "11 May 2010" "tabix-0.2.0" "Bioinformatics tools" +.SH NAME +.PP +bgzip - Block compression/decompression utility +.PP +tabix - Generic 
indexer for TAB-delimited genome position files +.SH SYNOPSIS +.PP +.B bgzip +.RB [ \-cdhB ] +.RB [ \-b +.IR virtualOffset ] +.RB [ \-s +.IR size ] +.RI [ file ] +.PP +.B tabix +.RB [ \-0lf ] +.RB [ \-p +.R gff|bed|sam|vcf] +.RB [ \-s +.IR seqCol ] +.RB [ \-b +.IR begCol ] +.RB [ \-e +.IR endCol ] +.RB [ \-S +.IR lineSkip ] +.RB [ \-c +.IR metaChar ] +.I in.tab.bgz +.RI [ "region1 " [ "region2 " [ ... "]]]" + +.SH DESCRIPTION +.PP +Tabix indexes a TAB-delimited genome position file +.I in.tab.bgz +and creates an index file +.I in.tab.bgz.tbi +when +.I region +is absent from the command-line. The input data file must be position +sorted and compressed by +.B bgzip +which has a +.BR gzip (1) +like interface. After indexing, tabix is able to quickly retrieve data +lines overlapping +.I regions +specified in the format "chr:beginPos-endPos". Fast data retrieval also +works over network if URI is given as a file name and in this case the +index file will be downloaded if it is not present locally. + +.SH OPTIONS OF TABIX +.TP 10 +.BI "-p " STR +Input format for indexing. Valid values are: gff, bed, sam, vcf and +psltab. This option should not be applied together with any of +.BR \-s ", " \-b ", " \-e ", " \-c " and " \-0 ; +it is not used for data retrieval because this setting is stored in +the index file. [gff] +.TP +.BI "-s " INT +Column of sequence name. Option +.BR \-s ", " \-b ", " \-e ", " \-S ", " \-c " and " \-0 +are all stored in the index file and thus not used in data retrieval. [1] +.TP +.BI "-b " INT +Column of start chromosomal position. [4] +.TP +.BI "-e " INT +Column of end chromosomal position. The end column can be the same as the +start column. [5] +.TP +.BI "-S " INT +Skip first INT lines in the data file. [0] +.TP +.BI "-c " CHAR +Skip lines started with character CHAR. [#] +.TP +.B -0 +Specify that the position in the data file is 0-based (e.g. UCSC files) +rather than 1-based. +.TP +.B -h +Print the header/meta lines. 
+.TP +.B -B +The second argument is a BED file. When this option is in use, the input +file may not be sorted or indexed. The entire input will be read sequentially. Nonetheless, +with this option, the format of the input must be specificed correctly on the command line. +.TP +.B -f +Force to overwrite the index file if it is present. +.TP +.B -l +List the sequence names stored in the index file. +.RE + +.SH EXAMPLE +(grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz; + +tabix -p gff sorted.gff.gz; + +tabix sorted.gff.gz chr1:10,000,000-20,000,000; + +.SH NOTES +It is straightforward to achieve overlap queries using the standard +B-tree index (with or without binning) implemented in all SQL databases, +or the R-tree index in PostgreSQL and Oracle. But there are still many +reasons to use tabix. Firstly, tabix directly works with a lot of widely +used TAB-delimited formats such as GFF/GTF and BED. We do not need to +design database schema or specialized binary formats. Data do not need +to be duplicated in different formats, either. Secondly, tabix works on +compressed data files while most SQL databases do not. The GenCode +annotation GTF can be compressed down to 4%. Thirdly, tabix is +fast. The same indexing algorithm is known to work efficiently for an +alignment with a few billion short reads. SQL databases probably cannot +easily handle data at this scale. Last but not the least, tabix supports +remote data retrieval. One can put the data file and the index at an FTP +or HTTP server, and other users or even web services will be able to get +a slice without downloading the entire file. + +.SH AUTHOR +.PP +Tabix was written by Heng Li. The BGZF library was originally +implemented by Bob Handsaker and modified by Heng Li for remote file +access and in-memory caching. 
+ +.SH SEE ALSO +.PP +.BR samtools (1) diff --git a/star-sys/STAR/source/htslib/tabix.c b/star-sys/STAR/source/htslib/tabix.c new file mode 100644 index 0000000..2a30fed --- /dev/null +++ b/star-sys/STAR/source/htslib/tabix.c @@ -0,0 +1,374 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "htslib/tbx.h" +#include "htslib/sam.h" +#include "htslib/vcf.h" +#include "htslib/kseq.h" +#include "htslib/bgzf.h" +#include "htslib/hts.h" + +typedef struct +{ + int min_shift; +} +args_t; + +static void error(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + exit(EXIT_FAILURE); +} + + +#define IS_GFF (1<<0) +#define IS_BED (1<<1) +#define IS_SAM (1<<2) +#define IS_VCF (1<<3) +#define IS_BCF (1<<4) +#define IS_BAM (1<<5) +#define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF) + +int file_type(const char *fname) +{ + int l = strlen(fname); + int strcasecmp(const char *s1, const char *s2); + if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF; + else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED; + else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM; + else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF; + else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF; + else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM; + return 0; +} + +#define PRINT_HEADER 1 +#define HEADER_ONLY 2 +static int query_regions(char **argv, int argc, int mode) +{ + char *fname = argv[0]; + int i, ftype = file_type(fname); + + if ( ftype & IS_TXT || !ftype ) + { + htsFile *fp = hts_open(fname,"r"); + if ( !fp ) error("Could not read %s\n", fname); + tbx_t *tbx = tbx_index_load(fname); + if ( !tbx ) error("Could not load .tbi index of %s\n", fname); + kstring_t str = {0,0,0}; + if ( mode ) + { + while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 ) + { + if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break; + 
puts(str.s); + } + } + if ( mode!=HEADER_ONLY ) + { + for (i=1; i= 0) puts(str.s); + tbx_itr_destroy(itr); + } + } + free(str.s); + if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname); + tbx_destroy(tbx); + } + else if ( ftype==IS_BCF ) // output uncompressed VCF + { + htsFile *fp = hts_open(fname,"r"); + if ( !fp ) error("Could not read %s\n", fname); + htsFile *out = hts_open("-","w"); + if ( !out ) error("Could not open stdout\n", fname); + hts_idx_t *idx = bcf_index_load(fname); + if ( !idx ) error("Could not load .csi index of %s\n", fname); + bcf_hdr_t *hdr = bcf_hdr_read(fp); + if ( !hdr ) error("Could not read the header: %s\n", fname); + if ( mode ) + { + bcf_hdr_write(out,hdr); + } + if ( mode!=HEADER_ONLY ) + { + bcf1_t *rec = bcf_init(); + for (i=1; i=0 ) bcf_write(out,hdr,rec); + tbx_itr_destroy(itr); + } + bcf_destroy(rec); + } + if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname); + if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n"); + bcf_hdr_destroy(hdr); + hts_idx_destroy(idx); + } + else if ( ftype==IS_BAM ) // todo: BAM + error("Please use \"samtools view\" for querying BAM files.\n"); + return 0; +} +static int query_chroms(char *fname) +{ + const char **seq; + int i, nseq, ftype = file_type(fname); + if ( ftype & IS_TXT || !ftype ) + { + tbx_t *tbx = tbx_index_load(fname); + if ( !tbx ) error("Could not load .tbi index of %s\n", fname); + seq = tbx_seqnames(tbx, &nseq); + for (i=0; iblock_length ) return -1; + + char *buffer = fp->uncompressed_block; + int skip_until = 0; + + // Skip the header: find out the position of the data block + if ( buffer[0]==conf->meta_char ) + { + skip_until = 1; + while (1) + { + if ( buffer[skip_until]=='\n' ) + { + skip_until++; + if ( skip_until>=fp->block_length ) + { + if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname); + skip_until = 0; + } + // The header has finished + if ( 
buffer[skip_until]!=conf->meta_char ) break; + } + skip_until++; + if ( skip_until>=fp->block_length ) + { + if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname); + skip_until = 0; + } + } + } + + // Output the new header + FILE *hdr = fopen(header,"r"); + if ( !hdr ) error("%s: %s", header,strerror(errno)); + int page_size = getpagesize(); + char *buf = valloc(page_size); + BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w"); + ssize_t nread; + while ( (nread=fread(buf,1,page_size-1,hdr))>0 ) + { + if ( nreaderrcode); + } + if ( fclose(hdr) ) error("close failed: %s\n", header); + + // Output all remainig data read with the header block + if ( fp->block_length - skip_until > 0 ) + { + if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",fp->errcode); + } + if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode); + + while (1) + { + nread = bgzf_raw_read(fp, buf, page_size); + if ( nread<=0 ) break; + + int count = bgzf_raw_write(bgzf_out, buf, nread); + if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); + } + if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode); + if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode); + } + else + error("todo: reheader BCF, BAM\n"); // BCF is difficult, records contain pointers to the header. 
+ return 0; +} + +static int usage(void) +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Version: %s\n", hts_version()); + fprintf(stderr, "Usage: tabix [OPTIONS] [FILE] [REGION [...]]\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -0, --zero-based coordinates are zero-based\n"); + fprintf(stderr, " -b, --begin INT column number for region start [4]\n"); + fprintf(stderr, " -c, --comment CHAR skip comment lines starting with CHAR [null]\n"); + fprintf(stderr, " -e, --end INT column number for region end (if no end, set INT to -b) [5]\n"); + fprintf(stderr, " -f, --force overwrite existing index without asking\n"); + fprintf(stderr, " -h, --print-header print also the header lines\n"); + fprintf(stderr, " -H, --only-header print only the header lines\n"); + fprintf(stderr, " -l, --list-chroms list chromosome names\n"); + fprintf(stderr, " -m, --min-shift INT set the minimal interval size to 1<= 0) + { + switch (c) + { + case 'r': reheader = optarg; break; + case 'h': mode = PRINT_HEADER; break; + case 'H': mode = HEADER_ONLY; break; + case 'l': list_chroms = 1; break; + case '0': conf.preset |= TBX_UCSC; break; + case 'b': conf.bc = atoi(optarg); break; + case 'e': conf.ec = atoi(optarg); break; + case 'c': conf.meta_char = *optarg; break; + case 'f': is_force = 1; break; + case 'm': min_shift = atoi(optarg); break; + case 'p': + if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff; + else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed; + else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam; + else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf; + else error("The preset string not recognised: '%s'\n", optarg); + break; + case 's': conf.sc = atoi(optarg); break; + case 'S': conf.line_skip = atoi(optarg); break; + default: return usage(); + } + } + + if ( optind==argc ) return usage(); + + if ( list_chroms ) + return query_chroms(argv[optind]); + + if ( argc > optind+1 || mode==HEADER_ONLY ) + return 
query_regions(&argv[optind], argc-optind, mode); + + char *fname = argv[optind]; + int ftype = file_type(fname); + if ( !conf_ptr ) // no preset given + { + if ( ftype==IS_GFF ) conf_ptr = &tbx_conf_gff; + else if ( ftype==IS_BED ) conf_ptr = &tbx_conf_bed; + else if ( ftype==IS_SAM ) conf_ptr = &tbx_conf_sam; + else if ( ftype==IS_VCF ) conf_ptr = &tbx_conf_vcf; + else if ( ftype==IS_BCF ) + { + if ( min_shift <= 0 ) min_shift = 14; + } + else if ( ftype==IS_BAM ) + { + if ( min_shift <= 0 ) min_shift = 14; + } + } + if ( reheader ) + return reheader_file(fname, reheader, ftype, conf_ptr); + + if ( conf_ptr ) + conf = *conf_ptr; + + char *suffix = min_shift <= 0 ? ".tbi" : (ftype==IS_BAM ? ".bai" : ".csi"); + char *idx_fname = calloc(strlen(fname) + 5, 1); + strcat(strcpy(idx_fname, fname), suffix); + + struct stat stat_tbi, stat_file; + if ( !is_force && stat(idx_fname, &stat_tbi)==0 ) + { + // Before complaining about existing index, check if the VCF file isn't + // newer. This is a common source of errors, people tend not to notice + // that tabix failed + stat(fname, &stat_file); + if ( stat_file.st_mtime <= stat_tbi.st_mtime ) + error("[tabix] the index file exists. 
Please use '-f' to overwrite.\n"); + } + free(idx_fname); + + if ( min_shift > 0 ) // CSI index + { + if ( ftype==IS_BCF ) + { + if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname); + return 0; + } + if ( ftype==IS_BAM ) + { + if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname); + return 0; + } + if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname); + return 0; + } + else + { + if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname); + return 0; + } + return 0; +} diff --git a/star-sys/STAR/source/htslib/tbx.c b/star-sys/STAR/source/htslib/tbx.c new file mode 100644 index 0000000..035cb68 --- /dev/null +++ b/star-sys/STAR/source/htslib/tbx.c @@ -0,0 +1,290 @@ +#include +#include +#include +#include +#include +#include "htslib/tbx.h" +#include "htslib/bgzf.h" + +#include "htslib/khash.h" +KHASH_DECLARE(s2i, kh_cstr_t, int64_t) + +tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 }; +tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 }; +tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 }; +tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 }; +tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 }; + +typedef struct { + int64_t beg, end; + char *ss, *se; + int tid; +} tbx_intv_t; + +static inline int get_tid(tbx_t *tbx, const char *ss, int is_add) +{ + khint_t k; + khash_t(s2i) *d; + if (tbx->dict == 0) tbx->dict = kh_init(s2i); + d = (khash_t(s2i)*)tbx->dict; + if (is_add) { + int absent; + k = kh_put(s2i, d, ss, &absent); + if (absent) { + kh_key(d, k) = strdup(ss); + kh_val(d, k) = kh_size(d) - 1; + } + } else k = kh_get(s2i, d, ss); + return k == kh_end(d)? 
-1 : kh_val(d, k); +} + +int tbx_name2id(tbx_t *tbx, const char *ss) +{ + return get_tid(tbx, ss, 0); +} + +int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) +{ + int i, b = 0, id = 1, ncols = 0; + char *s; + intv->ss = intv->se = 0; intv->beg = intv->end = -1; + for (i = 0; i <= len; ++i) { + if (line[i] == '\t' || line[i] == 0) { + ++ncols; + if (id == conf->sc) { + intv->ss = line + b; intv->se = line + i; + } else if (id == conf->bc) { + // here ->beg is 0-based. + intv->beg = intv->end = strtol(line + b, &s, 0); + if ( s==line+b ) return -1; // expected int + if (!(conf->preset&TBX_UCSC)) --intv->beg; + else ++intv->end; + if (intv->beg < 0) intv->beg = 0; + if (intv->end < 1) intv->end = 1; + } else { + if ((conf->preset&0xffff) == TBX_GENERIC) { + if (id == conf->ec) + { + intv->end = strtol(line + b, &s, 0); + if ( s==line+b ) return -1; // expected int + } + } else if ((conf->preset&0xffff) == TBX_SAM) { + if (id == 6) { // CIGAR + int l = 0, op; + char *t; + for (s = line + b; s < line + i;) { + long x = strtol(s, &t, 10); + op = toupper(*t); + if (op == 'M' || op == 'D' || op == 'N') l += x; + s = t + 1; + } + if (l == 0) l = 1; + intv->end = intv->beg + l; + } + } else if ((conf->preset&0xffff) == TBX_VCF) { + if (id == 4) { + if (b < i) intv->end = intv->beg + (i - b); + } else if (id == 8) { // look for "END=" + int c = line[i]; + line[i] = 0; + s = strstr(line + b, "END="); + if (s == line + b) s += 4; + else if (s) { + s = strstr(line + b, ";END="); + if (s) s += 5; + } + if (s) intv->end = strtol(s, &s, 0); + line[i] = c; + } + } + } + b = i + 1; + ++id; + } + } + if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1; + return 0; +} + +static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add) +{ + if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) { + int c = *intv->se; + *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c; + return 
(intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1; + } else { + char *type = NULL; + switch (tbx->conf.preset&0xffff) + { + case TBX_SAM: type = "TBX_SAM"; break; + case TBX_VCF: type = "TBX_VCF"; break; + case TBX_UCSC: type = "TBX_UCSC"; break; + default: type = "TBX_GENERIC"; break; + } + fprintf(stderr, "[E::%s] failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"\n", __func__, type, str->s); + return -1; + } +} + +int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end) +{ + tbx_t *tbx = (tbx_t *) tbxv; + kstring_t *s = (kstring_t *) sv; + int ret; + if ((ret = bgzf_getline(fp, '\n', s)) >= 0) { + tbx_intv_t intv; + get_intv(tbx, s, &intv, 0); + *tid = intv.tid; *beg = intv.beg; *end = intv.end; + } + return ret; +} + +void tbx_set_meta(tbx_t *tbx) +{ + int i, l = 0, l_nm; + uint32_t x[7]; + char **name; + uint8_t *meta; + khint_t k; + khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict; + + memcpy(x, &tbx->conf, 24); + name = (char**)malloc(sizeof(char*) * kh_size(d)); + for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) { + if (!kh_exist(d, k)) continue; + name[kh_val(d, k)] = (char*)kh_key(d, k); + l += strlen(kh_key(d, k)) + 1; // +1 to include '\0' + } + l_nm = x[6] = l; + meta = (uint8_t*)malloc(l_nm + 28); + if (ed_is_big()) + for (i = 0; i < 7; ++i) + x[i] = ed_swap_4(x[i]); + memcpy(meta, x, 28); + for (l = 28, i = 0; i < (int)kh_size(d); ++i) { + int x = strlen(name[i]) + 1; + memcpy(meta + l, name[i], x); + l += x; + } + free(name); + hts_idx_set_meta(tbx->idx, l, meta, 0); +} + +tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) +{ + tbx_t *tbx; + kstring_t str; + int ret, first = 0, n_lvls, fmt; + int64_t lineno = 0; + uint64_t last_off = 0; + tbx_intv_t intv; + + str.s = 0; str.l = str.m = 0; + tbx = (tbx_t*)calloc(1, sizeof(tbx_t)); + tbx->conf = *conf; + if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI; + else min_shift = 14, n_lvls = 5, fmt = 
HTS_FMT_TBI; + while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { + ++lineno; + if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) { + last_off = bgzf_tell(fp); + continue; + } + if (first == 0) { + tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); + first = 1; + } + get_intv(tbx, &str, &intv, 1); + ret = hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1); + if (ret < 0) + { + free(str.s); + tbx_destroy(tbx); + return NULL; + } + } + if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); // empty file + if ( !tbx->dict ) tbx->dict = kh_init(s2i); + hts_idx_finish(tbx->idx, bgzf_tell(fp)); + tbx_set_meta(tbx); + free(str.s); + return tbx; +} + +void tbx_destroy(tbx_t *tbx) +{ + khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict; + if (d != NULL) + { + khint_t k; + for (k = kh_begin(d); k != kh_end(d); ++k) + if (kh_exist(d, k)) free((char*)kh_key(d, k)); + } + hts_idx_destroy(tbx->idx); + kh_destroy(s2i, d); + free(tbx); +} + +int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf) +{ + tbx_t *tbx; + BGZF *fp; + if ( bgzf_is_bgzf(fn)!=1 ) { fprintf(stderr,"Not a BGZF file: %s\n", fn); return -1; } + if ((fp = bgzf_open(fn, "r")) == 0) return -1; + if ( !fp->is_compressed ) { bgzf_close(fp); return -1; } + tbx = tbx_index(fp, min_shift, conf); + bgzf_close(fp); + if ( !tbx ) return -1; + hts_idx_save(tbx->idx, fn, min_shift > 0? 
HTS_FMT_CSI : HTS_FMT_TBI); + tbx_destroy(tbx); + return 0; +} + +tbx_t *tbx_index_load(const char *fn) +{ + tbx_t *tbx; + uint8_t *meta; + char *nm, *p; + uint32_t x[7]; + int l_meta, l_nm; + tbx = (tbx_t*)calloc(1, sizeof(tbx_t)); + tbx->idx = hts_idx_load(fn, HTS_FMT_TBI); + if ( !tbx->idx ) + { + free(tbx); + return NULL; + } + meta = hts_idx_get_meta(tbx->idx, &l_meta); + memcpy(x, meta, 28); + memcpy(&tbx->conf, x, 24); + p = nm = (char*)meta + 28; + l_nm = x[6]; + for (; p - nm < l_nm; p += strlen(p) + 1) get_tid(tbx, p, 1); + return tbx; +} + +const char **tbx_seqnames(tbx_t *tbx, int *n) +{ + khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict; + if (d == NULL) + { + *n = 0; + return NULL; + } + int tid, m = kh_size(d); + const char **names = (const char**) calloc(m,sizeof(const char*)); + khint_t k; + for (k=kh_begin(d); k +#include +#include +#include +#include +#include +#include +#include "htslib/kstring.h" +#include "htslib/bgzf.h" +#include "htslib/vcf.h" +#include "htslib/tbx.h" +#include "htslib/hfile.h" + +#include "htslib/khash.h" +KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) +typedef khash_t(vdict) vdict_t; + +#include "htslib/kseq.h" +KSTREAM_DECLARE(gzFile, gzread) + +uint32_t bcf_float_missing = 0x7F800001; +uint32_t bcf_float_vector_end = 0x7F800002; +uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 }; + +/************************* + *** VCF header parser *** + *************************/ + +int bcf_hdr_sync(bcf_hdr_t *h); + +int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s) +{ + if ( !s ) + { + bcf_hdr_sync(h); + return 0; + } + + const char *ss = s; + while ( !*ss && isspace(*ss) ) ss++; + if ( !*ss ) + { + fprintf(stderr,"[W::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__); + abort(); + } + + vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE]; + int ret; + char *sdup = strdup(s); + int k = 
kh_put(vdict, d, sdup, &ret); + if (ret) { // absent + kh_val(d, k) = bcf_idinfo_def; + kh_val(d, k).id = kh_size(d) - 1; + } else { + if (hts_verbose >= 2) + fprintf(stderr, "[W::%s] Duplicated sample name '%s'. Skipped.\n", __func__, s); + free(sdup); + return -1; + } + int n = kh_size(d); + h->samples = (char**) realloc(h->samples,sizeof(char*)*n); + h->samples[n-1] = sdup; + return 0; +} + +void bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) +{ + int i = 0; + const char *p, *q; + // add samples + for (p = q = str;; ++q) { + if (*q != '\t' && *q != 0 && *q != '\n') continue; + if (++i > 9) { + char *s = (char*)malloc(q - p + 1); + strncpy(s, p, q - p); + s[q - p] = 0; + bcf_hdr_add_sample(h,s); + free(s); + } + if (*q == 0 || *q == '\n') break; + p = q + 1; + } + bcf_hdr_add_sample(h,NULL); +} + +int bcf_hdr_sync(bcf_hdr_t *h) +{ + int i; + for (i = 0; i < 3; i++) + { + vdict_t *d = (vdict_t*)h->dict[i]; + khint_t k; + + // find out the largest id, there may be holes because of IDX + int max_id = -1; + for (k=kh_begin(d); k= h->n[i] ) + { + h->id[i] = (bcf_idpair_t*)realloc(h->id[i], (max_id+1)*sizeof(bcf_idpair_t)); + for (k=h->n[i]; k<=max_id; k++) + { + h->id[i][k].key = NULL; + h->id[i][k].val = NULL; + } + h->n[i] = max_id+1; + } + for (k=kh_begin(d); kid[i][kh_val(d,k).id].key = kh_key(d,k); + h->id[i][kh_val(d,k).id].val = &kh_val(d,k); + } + } + return 0; +} + +void bcf_hrec_destroy(bcf_hrec_t *hrec) +{ + free(hrec->key); + if ( hrec->value ) free(hrec->value); + int i; + for (i=0; inkeys; i++) + { + free(hrec->keys[i]); + free(hrec->vals[i]); + } + free(hrec->keys); + free(hrec->vals); + free(hrec); +} + +// Copies all fields except IDX. 
+bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec) +{ + bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); + out->type = hrec->type; + if ( hrec->key ) out->key = strdup(hrec->key); + if ( hrec->value ) out->value = strdup(hrec->value); + out->nkeys = hrec->nkeys; + out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys); + out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys); + int i, j = 0; + for (i=0; inkeys; i++) + { + if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue; + if ( hrec->keys[i] ) out->keys[j] = strdup(hrec->keys[i]); + if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]); + j++; + } + if ( i!=j ) out->nkeys--; // IDX was omitted + return out; +} + +void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec) +{ + fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:""); + int i; + for (i=0; inkeys; i++) + fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]); + fprintf(fp, "\n"); +} + +void bcf_header_debug(bcf_hdr_t *hdr) +{ + int i, j; + for (i=0; inhrec; i++) + { + if ( !hdr->hrec[i]->value ) + { + fprintf(stderr, "##%s=<", hdr->hrec[i]->key); + fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]); + for (j=1; jhrec[i]->nkeys; j++) + fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]); + fprintf(stderr,">\n"); + } + else + fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value); + } +} + +void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len) +{ + int n = ++hrec->nkeys; + hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n); + hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n); + assert( len ); + hrec->keys[n-1] = (char*) malloc((len+1)*sizeof(char)); + memcpy(hrec->keys[n-1],str,len); + hrec->keys[n-1][len] = 0; + hrec->vals[n-1] = NULL; +} + +void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted) +{ + if ( !str ) { hrec->vals[i] = NULL; return; } + if ( hrec->vals[i] ) free(hrec->vals[i]); + if ( is_quoted ) + { + 
hrec->vals[i] = (char*) malloc((len+3)*sizeof(char)); + hrec->vals[i][0] = '"'; + memcpy(&hrec->vals[i][1],str,len); + hrec->vals[i][len+1] = '"'; + hrec->vals[i][len+2] = 0; + } + else + { + hrec->vals[i] = (char*) malloc((len+1)*sizeof(char)); + memcpy(hrec->vals[i],str,len); + hrec->vals[i][len] = 0; + } +} + +void hrec_add_idx(bcf_hrec_t *hrec, int idx) +{ + int n = ++hrec->nkeys; + hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n); + hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n); + hrec->keys[n-1] = strdup("IDX"); + kstring_t str = {0,0,0}; + kputw(idx, &str); + hrec->vals[n-1] = str.s; +} + +int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key) +{ + int i; + for (i=0; inkeys; i++) + if ( !strcasecmp(key,hrec->keys[i]) ) return i; + return -1; +} + +static inline int is_escaped(const char *min, const char *str) +{ + int n = 0; + while ( --str>=min && *str=='\\' ) n++; + return n%2; +} + +bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) +{ + const char *p = line; + if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; } + p += 2; + + const char *q = p; + while ( *q && *q!='=' ) q++; + int n = q-p; + if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format + + bcf_hrec_t *hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); + hrec->key = (char*) malloc(sizeof(char)*(n+1)); + memcpy(hrec->key,p,n); + hrec->key[n] = 0; + + p = ++q; + if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579 + { + while ( *q && *q!='\n' ) q++; + hrec->value = (char*) malloc((q-p+1)*sizeof(char)); + memcpy(hrec->value, p, q-p); + hrec->value[q-p] = 0; + *len = q-line+1; + return hrec; + } + + // structured line, e.g. ##INFO= + int nopen = 1; + while ( *q && *q!='\n' && nopen ) + { + p = ++q; + while ( *q && *q!='=' ) q++; + n = q-p; + if ( *q!='=' || !n ) { *len = q-line+1; bcf_hrec_destroy(hrec); return NULL; } // wrong format + bcf_hrec_add_key(hrec, p, q-p); + p = ++q; + int quoted = *p=='"' ? 
1 : 0; + if ( quoted ) p++, q++; + while (1) + { + if ( !*q ) break; + if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; } + else + { + if ( *q=='<' ) nopen++; + if ( *q=='>' ) nopen--; + if ( !nopen ) break; + if ( *q==',' && nopen==1 ) break; + } + q++; + } + bcf_hrec_set_val(hrec, hrec->nkeys-1, p, q-p, quoted); + if ( quoted ) q++; + if ( *q=='>' ) { nopen--; q++; } + } + *len = q-line+1; + return hrec; +} + +// returns: 1 when hdr needs to be synced, 0 otherwise +int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) +{ + // contig + int i,j,k, ret; + char *str; + if ( !strcmp(hrec->key, "contig") ) + { + hrec->type = BCF_HL_CTG; + + // Get the contig ID ($str) and length ($j) + i = bcf_hrec_find_key(hrec,"length"); + if ( i<0 ) return 0; + if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0; + + i = bcf_hrec_find_key(hrec,"ID"); + if ( i<0 ) return 0; + str = strdup(hrec->vals[i]); + + // Register in the dictionary + vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG]; + k = kh_put(vdict, d, str, &ret); + if ( !ret ) { free(str); return 0; } // already present + + int idx = bcf_hrec_find_key(hrec,"IDX"); + if ( idx!=-1 ) + { + char *tmp = hrec->vals[idx]; + idx = strtol(hrec->vals[idx], &tmp, 10); + if ( *tmp ) + { + fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__); + return 0; + } + } + else + { + idx = kh_size(d) - 1; + hrec_add_idx(hrec, idx); + } + + kh_val(d, k) = bcf_idinfo_def; + kh_val(d, k).id = idx; + kh_val(d, k).info[0] = i; + kh_val(d, k).hrec[0] = hrec; + + return 1; + } + + if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO; + else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT; + else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT; + else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; } + else return 0; + + // INFO/FILTER/FORMAT + char *id = NULL; + int type = -1, num = -1, var = -1, idx = -1; + for (i=0; inkeys; i++) + { + if ( 
!strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i]; + else if ( !strcmp(hrec->keys[i], "IDX") ) + { + char *tmp = hrec->vals[i]; + idx = strtol(hrec->vals[i], &tmp, 10); + if ( *tmp ) + { + fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__); + return 0; + } + } + else if ( !strcmp(hrec->keys[i], "Type") ) + { + if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT; + else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL; + else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR; + else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG; + else + { + fprintf(stderr, "[E::%s] The type \"%s\" not supported, assuming \"String\"\n", __func__, hrec->vals[i]); + type = BCF_HT_STR; + } + } + else if ( !strcmp(hrec->keys[i], "Number") ) + { + if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A; + else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R; + else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G; + else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR; + else + { + sscanf(hrec->vals[i],"%d",&num); + var = BCF_VL_FIXED; + } + if (var != BCF_VL_FIXED) num = 0xfffff; + + } + } + uint32_t info = (uint32_t)num<<12 | var<<8 | type<<4 | hrec->type; + + if ( !id ) return 0; + str = strdup(id); + + vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID]; + k = kh_put(vdict, d, str, &ret); + if ( !ret ) + { + // already present + free(str); + if ( kh_val(d, k).hrec[info&0xf] ) return 0; + kh_val(d, k).info[info&0xf] = info; + kh_val(d, k).hrec[info&0xf] = hrec; + return 1; + } + kh_val(d, k) = bcf_idinfo_def; + kh_val(d, k).info[info&0xf] = info; + kh_val(d, k).hrec[info&0xf] = hrec; + kh_val(d, k).id = idx==-1 ? 
kh_size(d) - 1 : idx; + + if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id); + + return 1; +} + +int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) +{ + hrec->type = BCF_HL_GEN; + if ( !bcf_hdr_register_hrec(hdr,hrec) ) + { + // If one of the hashed field, then it is already present + if ( hrec->type != BCF_HL_GEN ) + { + bcf_hrec_destroy(hrec); + return 0; + } + + // Is one of the generic fields and already present? + int i; + for (i=0; inhrec; i++) + { + if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue; + if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break; + if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break; + } + if ( inhrec ) + { + bcf_hrec_destroy(hrec); + return 0; + } + } + + // New record, needs to be added + int n = ++hdr->nhrec; + hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); + hdr->hrec[n-1] = hrec; + + return hrec->type==BCF_HL_GEN ? 0 : 1; +} + +bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *id) +{ + int i; + if ( type==BCF_HL_GEN ) + { + for (i=0; inhrec; i++) + { + if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue; + if ( !strcmp(hdr->hrec[i]->key,id) ) return hdr->hrec[i]; + } + return NULL; + } + vdict_t *d = type==BCF_HL_CTG ? 
(vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID]; + khint_t k = kh_get(vdict, d, id); + if ( k == kh_end(d) ) return NULL; + return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type]; +} + +void bcf_hdr_check_sanity(bcf_hdr_t *hdr) +{ + static int PL_warned = 0, GL_warned = 0; + + if ( !PL_warned ) + { + int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL"); + if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G ) + { + fprintf(stderr,"[W::%s] PL should be declared as Number=G\n", __func__); + PL_warned = 1; + } + } + if ( !GL_warned ) + { + int id = bcf_hdr_id2int(hdr, BCF_HL_FMT, "GL"); + if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G ) + { + fprintf(stderr,"[W::%s] GL should be declared as Number=G\n", __func__); + PL_warned = 1; + } + } +} + +int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) +{ + int len, needs_sync = 0; + char *p = htxt; + + // Check sanity: "fileformat" string must come as first + bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); + if ( !hrec->key || strcasecmp(hrec->key,"fileformat") ) + fprintf(stderr, "[W::%s] The first line should be ##fileformat; is the VCF/BCF header broken?\n", __func__); + needs_sync += bcf_hdr_add_hrec(hdr, hrec); + + // The filter PASS must appear first in the dictionary + hrec = bcf_hdr_parse_line(hdr,"##FILTER=",&len); + needs_sync += bcf_hdr_add_hrec(hdr, hrec); + + // Parse the whole header + while ( (hrec=bcf_hdr_parse_line(hdr,p,&len)) ) + { + needs_sync += bcf_hdr_add_hrec(hdr, hrec); + p += len; + } + bcf_hdr_parse_sample_line(hdr,p); + if ( needs_sync ) bcf_hdr_sync(hdr); + bcf_hdr_check_sanity(hdr); + return 0; +} + +int bcf_hdr_append(bcf_hdr_t *hdr, const char *line) +{ + int len; + bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len); + if ( !hrec ) return -1; + if ( bcf_hdr_add_hrec(hdr, hrec) ) + bcf_hdr_sync(hdr); + return 0; +} + +void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) +{ + int 
i; + bcf_hrec_t *hrec; + while (1) + { + if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG ) + { + hrec = bcf_hdr_get_hrec(hdr, type, key); + if ( !hrec ) return; + + for (i=0; inhrec; i++) + if ( hdr->hrec[i]==hrec ) break; + assert( inhrec ); + + vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID]; + khint_t k = kh_get(vdict, d, key); + kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL; + } + else + { + for (i=0; inhrec; i++) + { + if ( hdr->hrec[i]->type!=type ) continue; + if ( !strcmp(hdr->hrec[i]->key,key) ) break; + } + if ( i==hdr->nhrec ) return; + hrec = hdr->hrec[i]; + } + + hdr->nhrec--; + if ( i < hdr->nhrec ) + memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*)); + bcf_hrec_destroy(hrec); + + bcf_hdr_sync(hdr); + } +} + +int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + int n = vsnprintf(NULL, 0, fmt, ap) + 2; + va_end(ap); + + char *line = (char*)malloc(n); + va_start(ap, fmt); + vsnprintf(line, n, fmt, ap); + va_end(ap); + + int ret = bcf_hdr_append(hdr, line); + + free(line); + return ret; +} + + +/********************** + *** BCF header I/O *** + **********************/ + +const char *bcf_hdr_get_version(const bcf_hdr_t *hdr) +{ + bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat"); + if ( !hrec ) + { + fprintf(stderr,"No version string found, assuming VCFv4.2\n"); + return "VCFv4.2"; + } + return hrec->value; +} + +void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) +{ + bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat"); + if ( !hrec ) + { + int len; + kstring_t str = {0,0,0}; + ksprintf(&str,"##fileformat=%s", version); + hrec = bcf_hdr_parse_line(hdr, str.s, &len); + free(str.s); + } + else + { + free(hrec->value); + hrec->value = strdup(version); + } + bcf_hdr_sync(hdr); +} + +bcf_hdr_t *bcf_hdr_init(const char *mode) +{ + int i; + bcf_hdr_t *h; + h = 
(bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t)); + for (i = 0; i < 3; ++i) + h->dict[i] = kh_init(vdict); + if ( strchr(mode,'w') ) + { + bcf_hdr_append(h, "##fileformat=VCFv4.2"); + // The filter PASS must appear first in the dictionary + bcf_hdr_append(h, "##FILTER="); + } + return h; +} + +void bcf_hdr_destroy(bcf_hdr_t *h) +{ + int i; + khint_t k; + for (i = 0; i < 3; ++i) { + vdict_t *d = (vdict_t*)h->dict[i]; + if (d == 0) continue; + for (k = kh_begin(d); k != kh_end(d); ++k) + if (kh_exist(d, k)) free((char*)kh_key(d, k)); + kh_destroy(vdict, d); + free(h->id[i]); + } + for (i=0; inhrec; i++) + bcf_hrec_destroy(h->hrec[i]); + if (h->nhrec) free(h->hrec); + if (h->samples) free(h->samples); + free(h->keep_samples); + free(h->transl[0]); free(h->transl[1]); + free(h->mem.s); + free(h); +} + +bcf_hdr_t *bcf_hdr_read(htsFile *hfp) +{ + if (!hfp->is_bin) + return vcf_hdr_read(hfp); + + BGZF *fp = hfp->fp.bgzf; + uint8_t magic[5]; + bcf_hdr_t *h; + h = bcf_hdr_init("r"); + if ( bgzf_read(fp, magic, 5)<0 ) + { + fprintf(stderr,"[%s:%d %s] Failed to read the header (reading BCF in text mode?)\n", __FILE__,__LINE__,__FUNCTION__); + return NULL; + } + if (strncmp((char*)magic, "BCF\2\2", 5) != 0) + { + if (!strncmp((char*)magic, "BCF", 3)) + fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 is supported.\n", __FILE__,__LINE__,__FUNCTION__); + else if (hts_verbose >= 2) + fprintf(stderr, "[E::%s] invalid BCF2 magic string\n", __func__); + bcf_hdr_destroy(h); + return 0; + } + int hlen; + char *htxt; + bgzf_read(fp, &hlen, 4); + htxt = (char*)malloc(hlen); + bgzf_read(fp, htxt, hlen); + bcf_hdr_parse(h, htxt); + free(htxt); + return h; +} + +int bcf_hdr_write(htsFile *hfp, const bcf_hdr_t *h) +{ + if (!hfp->is_bin) return vcf_hdr_write(hfp, h); + + int hlen; + char *htxt = bcf_hdr_fmt_text(h, 1, &hlen); + hlen++; // include the \0 byte + + BGZF *fp = hfp->fp.bgzf; + if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1; + if ( bgzf_write(fp, &hlen, 4) !=4 ) 
return -1; + if ( bgzf_write(fp, htxt, hlen) != hlen ) return -1; + + free(htxt); + return 0; +} + +/******************** + *** BCF site I/O *** + ********************/ + +bcf1_t *bcf_init1() +{ + bcf1_t *v; + v = (bcf1_t*)calloc(1, sizeof(bcf1_t)); + return v; +} + +void bcf_clear(bcf1_t *v) +{ + int i; + for (i=0; id.m_info; i++) + { + if ( v->d.info[i].vptr_free ) + { + free(v->d.info[i].vptr - v->d.info[i].vptr_off); + v->d.info[i].vptr_free = 0; + } + } + for (i=0; id.m_fmt; i++) + { + if ( v->d.fmt[i].p_free ) + { + free(v->d.fmt[i].p - v->d.fmt[i].p_off); + v->d.fmt[i].p_free = 0; + } + } + v->rid = v->pos = v->rlen = v->unpacked = 0; + v->unpack_ptr = NULL; + bcf_float_set_missing(v->qual); + v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0; + v->shared.l = v->indiv.l = 0; + v->d.var_type = -1; + v->d.shared_dirty = 0; + v->d.indiv_dirty = 0; + v->d.n_flt = 0; + v->errcode = 0; + if (v->d.m_als) v->d.als[0] = 0; + if (v->d.m_id) v->d.id[0] = 0; +} + +void bcf_empty1(bcf1_t *v) +{ + bcf_clear1(v); + free(v->d.id); + free(v->d.als); + free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt); + if (v->d.var ) free(v->d.var); + free(v->shared.s); free(v->indiv.s); +} + +void bcf_destroy1(bcf1_t *v) +{ + bcf_empty1(v); + free(v); +} + +static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) +{ + uint32_t x[8]; + int ret; + if ((ret = bgzf_read(fp, x, 32)) != 32) { + if (ret == 0) return -1; + return -2; + } + bcf_clear1(v); + x[0] -= 24; // to exclude six 32-bit integers + ks_resize(&v->shared, x[0]); + ks_resize(&v->indiv, x[1]); + memcpy(v, x + 2, 16); + v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff; + v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff; + v->shared.l = x[0], v->indiv.l = x[1]; + + // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4 + if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0; + + bgzf_read(fp, v->shared.s, v->shared.l); + bgzf_read(fp, v->indiv.s, v->indiv.l); 
+ return 0; +} + +#define bit_array_size(n) ((n)/8+1) +#define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8)) +#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8))) +#define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8))) + +static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt); +int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) +{ + if ( !hdr->keep_samples ) return 0; + if ( !bcf_hdr_nsamples(hdr) ) + { + rec->indiv.l = rec->n_sample = 0; + return 0; + } + + int i, j; + uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src; + bcf_dec_t *dec = &rec->d; + hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt); + for (i=0; im_fmt; ++i) dec->fmt[i].p_free = 0; + + for (i=0; in_fmt; i++) + { + ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]); + src = dec->fmt[i].p - dec->fmt[i].size; + if ( dst ) + { + memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off); + dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off; + } + dst = dec->fmt[i].p; + for (j=0; jnsamples_ori; j++) + { + src += dec->fmt[i].size; + if ( !bit_array_test(hdr->keep_samples,j) ) continue; + memmove(dst, src, dec->fmt[i].size); + dst += dec->fmt[i].size; + } + rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p); + dec->fmt[i].p_len = dst - dec->fmt[i].p; + } + rec->unpacked |= BCF_UN_FMT; + + rec->n_sample = bcf_hdr_nsamples(hdr); + return 0; +} + +int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) +{ + if (!fp->is_bin) return vcf_read(fp,h,v); + int ret = bcf_read1_core(fp->fp.bgzf, v); + if ( ret!=0 || !h->keep_samples ) return ret; + return bcf_subset_format(h,v); +} + +int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end) +{ + bcf1_t *v = (bcf1_t *) vv; + int ret; + if ((ret = bcf_read1_core(fp, v)) >= 0) + *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen; + return ret; +} + +static inline void bcf1_sync_id(bcf1_t *line, 
kstring_t *str) +{ + // single typed string + if ( line->d.id && strcmp(line->d.id, ".") ) bcf_enc_vchar(str, strlen(line->d.id), line->d.id); + else bcf_enc_size(str, 0, BCF_BT_CHAR); +} +static inline void bcf1_sync_alleles(bcf1_t *line, kstring_t *str) +{ + // list of typed strings + int i; + for (i=0; in_allele; i++) + bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]); + line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0; // beware: this neglects SV's END tag +} +static inline void bcf1_sync_filter(bcf1_t *line, kstring_t *str) +{ + // typed vector of integers + if ( line->d.n_flt ) bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1); + else bcf_enc_vint(str, 0, 0, -1); +} +static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str) +{ + // pairs of typed vectors + int i, irm = -1; + for (i=0; in_info; i++) + { + bcf_info_t *info = &line->d.info[i]; + if ( !info->vptr ) + { + // marked for removal + if ( irm < 0 ) irm = i; + continue; + } + kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str); + if ( irm >=0 ) + { + bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp; + while ( irm<=i && line->d.info[irm].vptr ) irm++; + } + } + if ( irm>=0 ) line->n_info = irm; +} + +static int bcf1_sync(bcf1_t *line) +{ + kstring_t tmp = {0,0,0}; + if ( !line->shared.l ) + { + // New line, get ready for BCF output + tmp = line->shared; + bcf1_sync_id(line, &tmp); + bcf1_sync_alleles(line, &tmp); + bcf1_sync_filter(line, &tmp); + bcf1_sync_info(line, &tmp); + line->shared = tmp; + } + else if ( line->d.shared_dirty ) + { + // The line was edited, update the BCF data block, ptr_ori points + // to the original unchanged BCF data. 
+ uint8_t *ptr_ori = (uint8_t *) line->shared.s; + + // ID: single typed string + if ( line->d.shared_dirty & BCF1_DIRTY_ID ) + bcf1_sync_id(line, &tmp); + else + kputsn_(ptr_ori, line->unpack_size[0], &tmp); + ptr_ori += line->unpack_size[0]; + + // REF+ALT: list of typed strings + if ( line->d.shared_dirty & BCF1_DIRTY_ALS ) + bcf1_sync_alleles(line, &tmp); + else + { + kputsn_(ptr_ori, line->unpack_size[1], &tmp); + line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0; // beware: this neglects SV's END tag + } + ptr_ori += line->unpack_size[1]; + + // FILTER: typed vector of integers + if ( line->d.shared_dirty & BCF1_DIRTY_FLT ) + bcf1_sync_filter(line, &tmp); + else if ( line->d.n_flt ) + kputsn_(ptr_ori, line->unpack_size[2], &tmp); + else + bcf_enc_vint(&tmp, 0, 0, -1); + ptr_ori += line->unpack_size[2]; + + // INFO: pairs of typed vectors + if ( line->d.shared_dirty & BCF1_DIRTY_INF ) + bcf1_sync_info(line, &tmp); + else + { + int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s; + kputsn_(ptr_ori, size, &tmp); + } + free(line->shared.s); + line->shared = tmp; + } + if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) ) + { + // The genotype fields changed or are not present + tmp.l = tmp.m = 0; tmp.s = NULL; + int i, irm = -1; + for (i=0; in_fmt; i++) + { + bcf_fmt_t *fmt = &line->d.fmt[i]; + if ( !fmt->p ) + { + // marked for removal + if ( irm < 0 ) irm = i; + continue; + } + kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp); + if ( irm >=0 ) + { + bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt; + while ( irm<=i && line->d.fmt[irm].p ) irm++; + } + + } + if ( irm>=0 ) line->n_fmt = irm; + free(line->indiv.s); + line->indiv = tmp; + } + if ( !line->n_sample ) line->n_fmt = 0; + line->d.shared_dirty = line->d.indiv_dirty = 0; + return 0; +} + +bcf1_t *bcf_dup(bcf1_t *src) +{ + bcf1_sync(src); + + bcf1_t *out = bcf_init1(); + + out->rid = src->rid; + 
out->pos = src->pos; + out->rlen = src->rlen; + out->qual = src->qual; + out->n_info = src->n_info; out->n_allele = src->n_allele; + out->n_fmt = src->n_fmt; out->n_sample = src->n_sample; + + out->shared.m = out->shared.l = src->shared.l; + out->shared.s = (char*) malloc(out->shared.l); + memcpy(out->shared.s,src->shared.s,out->shared.l); + + out->indiv.m = out->indiv.l = src->indiv.l; + out->indiv.s = (char*) malloc(out->indiv.l); + memcpy(out->indiv.s,src->indiv.s,out->indiv.l); + + return out; +} + +int bcf_write(htsFile *hfp, const bcf_hdr_t *h, bcf1_t *v) +{ + if ( bcf_hdr_nsamples(h)!=v->n_sample ) + { + fprintf(stderr,"[%s:%d %s] Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d).\n", + __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h)); + return -1; + } + + if ( !hfp->is_bin ) return vcf_write(hfp,h,v); + + if ( v->errcode ) + { + // vcf_parse1() encountered a new contig or tag, undeclared in the + // header. At this point, the header must have been printed, + // proceeding would lead to a broken BCF file. Errors must be checked + // and cleared by the caller before we can proceed. 
+ fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,v->errcode); + exit(1); + } + bcf1_sync(v); // check if the BCF record was modified + + BGZF *fp = hfp->fp.bgzf; + uint32_t x[8]; + x[0] = v->shared.l + 24; // to include six 32-bit integers + x[1] = v->indiv.l; + memcpy(x + 2, v, 16); + x[6] = (uint32_t)v->n_allele<<16 | v->n_info; + x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample; + if ( bgzf_write(fp, x, 32) != 32 ) return -1; + if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1; + if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1; + return 0; +} + +/********************** + *** VCF header I/O *** + **********************/ + +bcf_hdr_t *vcf_hdr_read(htsFile *fp) +{ + kstring_t txt, *s = &fp->line; + bcf_hdr_t *h; + h = bcf_hdr_init("r"); + txt.l = txt.m = 0; txt.s = 0; + while (hts_getline(fp, KS_SEP_LINE, s) >= 0) { + if (s->l == 0) continue; + if (s->s[0] != '#') { + if (hts_verbose >= 2) + fprintf(stderr, "[E::%s] no sample line\n", __func__); + free(txt.s); + bcf_hdr_destroy(h); + return 0; + } + if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here + int dret; + gzFile f; + kstream_t *ks; + kstring_t tmp; + tmp.l = tmp.m = 0; tmp.s = 0; + f = gzopen(fp->fn_aux, "r"); + ks = ks_init(f); + while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) { + int c; + kputs("##contig=\n", 2, &txt); + if (dret != '\n') + while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line + } + free(tmp.s); + ks_destroy(ks); + gzclose(f); + } + kputsn(s->s, s->l, &txt); + kputc('\n', &txt); + if (s->s[1] != '#') break; + } + if ( !txt.s ) + { + fprintf(stderr,"[%s:%d %s] Could not read the header\n", __FILE__,__LINE__,__FUNCTION__); + return NULL; + } + bcf_hdr_parse(h, txt.s); + + // check tabix index, are all contigs listed in the header? 
add the missing ones + tbx_t *idx = tbx_index_load(fp->fn); + if ( idx ) + { + int i, n, need_sync = 0; + const char **names = tbx_seqnames(idx, &n); + for (i=0; ikey = strdup("contig"); + bcf_hrec_add_key(hrec, "ID", strlen("ID")); + bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0); + bcf_hrec_add_key(hrec, "length", strlen("length")); + bcf_hrec_set_val(hrec, hrec->nkeys-1, "2147483647", strlen("2147483647"), 0); + bcf_hdr_add_hrec(h, hrec); + need_sync = 1; + } + free(names); + tbx_destroy(idx); + if ( need_sync ) + bcf_hdr_sync(h); + } + free(txt.s); + return h; +} + +int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname) +{ + int i, n; + char **lines = hts_readlines(fname, &n); + if ( !lines ) return 1; + for (i=0; ivalue ) + { + int j, nout = 0; + ksprintf(str, "##%s=<", hrec->key); + for (j=0; jnkeys; j++) + { + // do not output IDX if output is VCF + if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue; + if ( nout ) kputc(',',str); + ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]); + nout++; + } + ksprintf(str,">\n"); + } + else + ksprintf(str,"##%s=%s\n", hrec->key,hrec->value); +} + +void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str) +{ + _bcf_hrec_format(hrec,0,str); +} +char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len) +{ + int i; + kstring_t txt = {0,0,0}; + for (i=0; inhrec; i++) + _bcf_hrec_format(hdr->hrec[i], is_bcf, &txt); + + ksprintf(&txt,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); + if ( bcf_hdr_nsamples(hdr) ) + { + ksprintf(&txt,"\tFORMAT"); + for (i=0; isamples[i]); + } + ksprintf(&txt,"\n"); + + if ( len ) *len = txt.l; + return txt.s; +} + +const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n) +{ + vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; + int tid, m = kh_size(d); + const char **names = (const char**) calloc(m,sizeof(const char*)); + khint_t k; + for (k=kh_begin(d); kis_compressed==1 ) + ret = bgzf_write(fp->fp.bgzf, htxt, hlen); + else + ret = hwrite(fp->fp.hfile, 
htxt, hlen); + free(htxt); + return ret<0 ? -1 : 0; +} + +/*********************** + *** Typed value I/O *** + ***********************/ + +void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) +{ + int32_t max = INT32_MIN + 1, min = INT32_MAX; + int i; + if (n == 0) bcf_enc_size(s, 0, BCF_BT_NULL); + else if (n == 1) bcf_enc_int1(s, a[0]); + else { + if (wsize <= 0) wsize = n; + for (i = 0; i < n; ++i) { + if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue; + if (max < a[i]) max = a[i]; + if (min > a[i]) min = a[i]; + } + if (max <= INT8_MAX && min > bcf_int8_vector_end) { + bcf_enc_size(s, wsize, BCF_BT_INT8); + for (i = 0; i < n; ++i) + if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s); + else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s); + else kputc(a[i], s); + } else if (max <= INT16_MAX && min > bcf_int16_vector_end) { + bcf_enc_size(s, wsize, BCF_BT_INT16); + for (i = 0; i < n; ++i) + { + int16_t x; + if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end; + else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing; + else x = a[i]; + kputsn((char*)&x, 2, s); + } + } else { + bcf_enc_size(s, wsize, BCF_BT_INT32); + for (i = 0; i < n; ++i) { + int32_t x = a[i]; + kputsn((char*)&x, 4, s); + } + } + } +} + +void bcf_enc_vfloat(kstring_t *s, int n, float *a) +{ + bcf_enc_size(s, n, BCF_BT_FLOAT); + kputsn((char*)a, n << 2, s); +} + +void bcf_enc_vchar(kstring_t *s, int l, const char *a) +{ + bcf_enc_size(s, l, BCF_BT_CHAR); + kputsn(a, l, s); +} + +void bcf_fmt_array(kstring_t *s, int n, int type, void *data) +{ + int j = 0; + if (n == 0) { + kputc('.', s); + return; + } + if (type == BCF_BT_CHAR) + { + char *p = (char*)data; + for (j = 0; j < n && *p; ++j, ++p) + { + if ( *p==bcf_str_missing ) kputc('.', s); + else kputc(*p, s); + } + } + else + { + #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \ + type_t *p = (type_t *) data; \ + for (j=0; jl&7) { + uint64_t zero = 0; + int l = ((s->l + 
7)>>3<<3) - s->l; + kputsn((char*)&zero, l, s); + } +} + +// p,q is the start and the end of the FORMAT field +int _vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) +{ + if ( !bcf_hdr_nsamples(h) ) return 0; + + char *r, *t; + int j, l, m, g; + khint_t k; + ks_tokaux_t aux1; + vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; + kstring_t *mem = (kstring_t*)&h->mem; + mem->l = 0; + + // count the number of format fields + for (r = p, v->n_fmt = 1; *r; ++r) + if (*r == ':') ++v->n_fmt; + char *end = s->s + s->l; + if ( q>=end ) + { + fprintf(stderr,"[%s:%d %s] Error: FORMAT column with no sample columns starting at %s:%d\n", __FILE__,__LINE__,__FUNCTION__,s->s,v->pos+1); + return -1; + } + + fmt_aux_t *fmt = (fmt_aux_t*)alloca(v->n_fmt * sizeof(fmt_aux_t)); + // get format information from the dictionary + for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { + *(char*)aux1.p = 0; + k = kh_get(vdict, d, t); + if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) { + fprintf(stderr, "[W::%s] FORMAT '%s' is not defined in the header, assuming Type=String\n", __func__, t); + kstring_t tmp = {0,0,0}; + int l; + ksprintf(&tmp, "##FORMAT=", t); + bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); + free(tmp.s); + if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); + k = kh_get(vdict, d, t); + v->errcode = BCF_ERR_TAG_UNDEF; + } + fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0; + fmt[j].key = kh_val(d, k).id; + fmt[j].is_gt = !strcmp(t, "GT"); + fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT]; + } + // compute max + int n_sample_ori = -1; + r = q + 1; // r: position in the format string + m = l = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles + while ( rkeep_samples ) + { + n_sample_ori++; + if ( !bit_array_test(h->keep_samples,n_sample_ori) ) + { + while ( *r!='\t' && r=end ) break; + r++; l++; + } + v->n_sample++; + if ( v->n_sample == 
bcf_hdr_nsamples(h) ) break; + r++; + } + + // allocate memory for arrays + for (j = 0; j < v->n_fmt; ++j) { + fmt_aux_t *f = &fmt[j]; + if ( !f->max_m ) f->max_m = 1; // omitted trailing format field + if ((f->y>>4&0xf) == BCF_HT_STR) { + f->size = f->is_gt? f->max_g << 2 : f->max_l; + } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) { + f->size = f->max_m << 2; + } else + { + fprintf(stderr, "[E::%s] the format type %d currently not supported\n", __func__, f->y>>4&0xf); + abort(); // I do not know how to do with Flag in the genotype fields + } + align_mem(mem); + f->offset = mem->l; + ks_resize(mem, mem->l + v->n_sample * f->size); + mem->l += v->n_sample * f->size; + } + for (j = 0; j < v->n_fmt; ++j) + fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset; + // fill the sample fields; at beginning of the loop, t points to the first char of a format + n_sample_ori = -1; + t = q + 1; m = 0; // m: sample id + while ( tkeep_samples ) + { + n_sample_ori++; + if ( !bit_array_test(h->keep_samples,n_sample_ori) ) + { + while ( *t && ty>>4&0xf) == BCF_HT_STR) { + if (z->is_gt) { // genotypes + int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * m); + for (l = 0;; ++t) { + if (*t == '.') ++t, x[l++] = is_phased; + else x[l++] = (strtol(t, &t, 10) + 1) << 1 | is_phased; + #if THOROUGH_SANITY_CHECKS + assert( 0 ); // success of strtol,strtod not checked + #endif + is_phased = (*t == '|'); + if (*t == ':' || *t == 0) break; + } + if ( !l ) x[l++] = 0; // An empty field, insert missing value + for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; + } else { + char *x = (char*)z->buf + z->size * m; + for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t; + for (; l < z->size; ++l) x[l] = 0; + } + } else if ((z->y>>4&0xf) == BCF_HT_INT) { + int32_t *x = (int32_t*)(z->buf + z->size * m); + for (l = 0;; ++t) { + if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "." 
+ else x[l++] = strtol(t, &t, 10); + if (*t == ':' || *t == 0) break; + } + if ( !l ) x[l++] = bcf_int32_missing; + for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; + } else if ((z->y>>4&0xf) == BCF_HT_REAL) { + float *x = (float*)(z->buf + z->size * m); + for (l = 0;; ++t) { + if (*t == '.' && !isdigit(t[1])) bcf_float_set_missing(x[l++]), ++t; // ++t to skip "." + else x[l++] = strtod(t, &t); + if (*t == ':' || *t == 0) break; + } + if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value + for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); + } else abort(); + if (*t == 0) { + for (++j; j < v->n_fmt; ++j) { // fill end-of-vector values + z = &fmt[j]; + if ((z->y>>4&0xf) == BCF_HT_STR) { + if (z->is_gt) { + int32_t *x = (int32_t*)(z->buf + z->size * m); + x[0] = bcf_int32_missing; + for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; + } else { + char *x = (char*)z->buf + z->size * m; + if ( z->size ) x[0] = '.'; + for (l = 1; l < z->size; ++l) x[l] = 0; + } + } else if ((z->y>>4&0xf) == BCF_HT_INT) { + int32_t *x = (int32_t*)(z->buf + z->size * m); + x[0] = bcf_int32_missing; + for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; + } else if ((z->y>>4&0xf) == BCF_HT_REAL) { + float *x = (float*)(z->buf + z->size * m); + bcf_float_set_missing(x[0]); + for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); + } + } + break; + } + else + { + if (*t == ':') ++j; + t++; + } + } + m++; t++; + } + + // write individual genotype information + kstring_t *str = &v->indiv; + int i; + if (v->n_sample > 0) { + for (i = 0; i < v->n_fmt; ++i) { + fmt_aux_t *z = &fmt[i]; + bcf_enc_int1(str, z->key); + if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) { + bcf_enc_size(str, z->size, BCF_BT_CHAR); + kputsn((char*)z->buf, z->size * v->n_sample, str); + } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) { + bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2); + } else { + bcf_enc_size(str, 
z->size>>2, BCF_BT_FLOAT); + kputsn((char*)z->buf, z->size * v->n_sample, str); + } + } + } + + if ( v->n_sample!=bcf_hdr_nsamples(h) ) + { + fprintf(stderr,"[%s:%d %s] Number of columns at %s:%d does not match the number of samples (%d vs %d).\n", + __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h)); + v->errcode |= BCF_ERR_NCOLS; + return -1; + } + + return 0; +} + +int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) +{ + int i = 0; + char *p, *q, *r, *t; + kstring_t *str; + khint_t k; + ks_tokaux_t aux; + + bcf_clear1(v); + str = &v->shared; + memset(&aux, 0, sizeof(ks_tokaux_t)); + for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) { + q = (char*)aux.p; + *q = 0; + if (i == 0) { // CHROM + vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; + k = kh_get(vdict, d, p); + if (k == kh_end(d)) + { + // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has + // been already printed, but will enable tools like vcfcheck to proceed. + fprintf(stderr, "[W::%s] contig '%s' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n", __func__, p); + kstring_t tmp = {0,0,0}; + int l; + ksprintf(&tmp, "##contig=", p); + bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); + free(tmp.s); + if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); + k = kh_get(vdict, d, p); + v->errcode = BCF_ERR_CTG_UNDEF; + } + v->rid = kh_val(d, k).id; + } else if (i == 1) { // POS + v->pos = atoi(p) - 1; + } else if (i == 2) { // ID + if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p); + else bcf_enc_size(str, 0, BCF_BT_CHAR); + } else if (i == 3) { // REF + bcf_enc_vchar(str, q - p, p); + v->n_allele = 1, v->rlen = q - p; + } else if (i == 4) { // ALT + if (strcmp(p, ".")) { + for (r = t = p;; ++r) { + if (*r == ',' || *r == 0) { + bcf_enc_vchar(str, r - t, t); + t = r + 1; + ++v->n_allele; + } + if (r == q) break; + } + } + } else if (i == 5) { // QUAL + if (strcmp(p, ".")) v->qual = atof(p); + else memcpy(&v->qual, &bcf_float_missing, 4); + if ( v->max_unpack && !(v->max_unpack>>1) ) return 0; // BCF_UN_STR + } else if (i == 6) { // FILTER + if (strcmp(p, ".")) { + int32_t *a; + int n_flt = 1, i; + ks_tokaux_t aux1; + vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; + // count the number of filters + if (*(q-1) == ';') *(q-1) = 0; + for (r = p; *r; ++r) + if (*r == ';') ++n_flt; + a = (int32_t*)alloca(n_flt * 4); + // add filters + for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) { + *(char*)aux1.p = 0; + k = kh_get(vdict, d, t); + if (k == kh_end(d)) + { + // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has + // been already printed, but will enable tools like vcfcheck to proceed. 
+ fprintf(stderr, "[W::%s] FILTER '%s' is not defined in the header\n", __func__, t); + kstring_t tmp = {0,0,0}; + int l; + ksprintf(&tmp, "##FILTER=", t); + bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); + free(tmp.s); + if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); + k = kh_get(vdict, d, t); + v->errcode = BCF_ERR_TAG_UNDEF; + } + a[i++] = kh_val(d, k).id; + } + n_flt = i; + bcf_enc_vint(str, n_flt, a, -1); + } else bcf_enc_vint(str, 0, 0, -1); + if ( v->max_unpack && !(v->max_unpack>>2) ) return 0; // BCF_UN_FLT + } else if (i == 7) { // INFO + char *key; + vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; + v->n_info = 0; + if (strcmp(p, ".")) { + if (*(q-1) == ';') *(q-1) = 0; + for (r = key = p;; ++r) { + int c; + char *val, *end; + if (*r != ';' && *r != '=' && *r != 0) continue; + val = end = 0; + c = *r; *r = 0; + if (c == '=') { + val = r + 1; + for (end = val; *end != ';' && *end != 0; ++end); + c = *end; *end = 0; + } else end = r; + k = kh_get(vdict, d, key); + if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15) + { + fprintf(stderr, "[W::%s] INFO '%s' is not defined in the header, assuming Type=String\n", __func__, key); + kstring_t tmp = {0,0,0}; + int l; + ksprintf(&tmp, "##INFO=", key); + bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); + free(tmp.s); + if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); + k = kh_get(vdict, d, key); + v->errcode = BCF_ERR_TAG_UNDEF; + } + uint32_t y = kh_val(d, k).info[BCF_HL_INFO]; + ++v->n_info; + bcf_enc_int1(str, kh_val(d, k).id); + if (val == 0) { + bcf_enc_size(str, 0, BCF_BT_NULL); + } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string + bcf_enc_vchar(str, end - val, val); + } else { // int/float value/array + int i, n_val; + char *t, *te; + for (t = val, n_val = 1; *t; ++t) // count the number of values + if (*t == ',') ++n_val; + if ((y>>4&0xf) == BCF_HT_INT) { + int32_t *z; + z = 
(int32_t*)alloca(n_val<<2); + for (i = 0, t = val; i < n_val; ++i, ++t) + { + z[i] = strtol(t, &te, 10); + if ( te==t ) // conversion failed + { + z[i] = bcf_int32_missing; + while ( *te && *te!=',' ) te++; + } + t = te; + } + bcf_enc_vint(str, n_val, z, -1); + if (strcmp(key, "END") == 0) v->rlen = z[0] - v->pos; + } else if ((y>>4&0xf) == BCF_HT_REAL) { + float *z; + z = (float*)alloca(n_val<<2); + for (i = 0, t = val; i < n_val; ++i, ++t) + { + z[i] = strtod(t, &te); + if ( te==t ) // conversion failed + { + bcf_float_set_missing(z[i]); + while ( *te && *te!=',' ) te++; + } + t = te; + } + bcf_enc_vfloat(str, n_val, z); + } + } + if (c == 0) break; + r = end; + key = r + 1; + } + } + if ( v->max_unpack && !(v->max_unpack>>3) ) return 0; + } else if (i == 8) // FORMAT + return _vcf_parse_format(s, h, v, p, q); + } + return 0; +} + +int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) +{ + int ret; + ret = hts_getline(fp, KS_SEP_LINE, &fp->line); + if (ret < 0) return -1; + return vcf_parse1(&fp->line, h, v); +} + +static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt) +{ + uint8_t *ptr_start = ptr; + fmt->id = bcf_dec_typed_int1(ptr, &ptr); + fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type); + fmt->size = fmt->n << bcf_type_shift[fmt->type]; + fmt->p = ptr; + fmt->p_off = ptr - ptr_start; + fmt->p_free = 0; + ptr += n_sample * fmt->size; + fmt->p_len = ptr - fmt->p; + return ptr; +} + +static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info) +{ + uint8_t *ptr_start = ptr; + info->key = bcf_dec_typed_int1(ptr, &ptr); + info->len = bcf_dec_size(ptr, &ptr, &info->type); + info->vptr = ptr; + info->vptr_off = ptr - ptr_start; + info->vptr_free = 0; + info->v1.i = 0; + if (info->len == 1) { + if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr; + else if (info->type == BCF_BT_INT32) info->v1.i = *(int32_t*)ptr; + else if (info->type == BCF_BT_FLOAT) info->v1.f = *(float*)ptr; + 
else if (info->type == BCF_BT_INT16) info->v1.i = *(int16_t*)ptr; + } + ptr += info->len << bcf_type_shift[info->type]; + info->vptr_len = ptr - info->vptr; + return ptr; +} + +int bcf_unpack(bcf1_t *b, int which) +{ + if ( !b->shared.l ) return 0; // Building a new BCF record from scratch + uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori; + int *offset, i; + bcf_dec_t *d = &b->d; + if (which & BCF_UN_FLT) which |= BCF_UN_STR; + if (which & BCF_UN_INFO) which |= BCF_UN_SHR; + if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR)) + { + kstring_t tmp; + + // ID + tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id; + ptr_ori = ptr; + ptr = bcf_fmt_sized_array(&tmp, ptr); + b->unpack_size[0] = ptr - ptr_ori; + kputc('\0', &tmp); + d->id = tmp.s; d->m_id = tmp.m; + + // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block + tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als; + offset = (int*)alloca(b->n_allele * sizeof(int)); + ptr_ori = ptr; + for (i = 0; i < b->n_allele; ++i) { + offset[i] = tmp.l; + ptr = bcf_fmt_sized_array(&tmp, ptr); + kputc('\0', &tmp); + } + b->unpack_size[1] = ptr - ptr_ori; + d->als = tmp.s; d->m_als = tmp.m; + + hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro + for (i = 0; i < b->n_allele; ++i) + d->allele[i] = d->als + offset[i]; + b->unpack_ptr = ptr; + b->unpacked |= BCF_UN_STR; + } + if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER + ptr = b->unpack_ptr; + ptr_ori = ptr; + if (*ptr>>4) { + int type; + d->n_flt = bcf_dec_size(ptr, &ptr, &type); + hts_expand(int, d->n_flt, d->m_flt, d->flt); + for (i = 0; i < d->n_flt; ++i) + d->flt[i] = bcf_dec_int1(ptr, type, &ptr); + } else ++ptr, d->n_flt = 0; + b->unpack_size[2] = ptr - ptr_ori; + b->unpack_ptr = ptr; + b->unpacked |= BCF_UN_FLT; + } + if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO + ptr = b->unpack_ptr; + hts_expand(bcf_info_t, b->n_info, d->m_info, d->info); + for (i = 0; i < d->m_info; ++i) 
d->info[i].vptr_free = 0; + for (i = 0; i < b->n_info; ++i) + ptr = bcf_unpack_info_core1(ptr, &d->info[i]); + b->unpacked |= BCF_UN_INFO; + } + if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT + ptr = (uint8_t*)b->indiv.s; + hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt); + for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0; + for (i = 0; i < b->n_fmt; ++i) + ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]); + b->unpacked |= BCF_UN_FMT; + } + return 0; +} + +int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) +{ + int i; + bcf_unpack((bcf1_t*)v, BCF_UN_ALL); + kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM + kputc('\t', s); kputw(v->pos + 1, s); // POS + kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID + kputc('\t', s); // REF + if (v->n_allele > 0) kputs(v->d.allele[0], s); + else kputc('.', s); + kputc('\t', s); // ALT + if (v->n_allele > 1) { + for (i = 1; i < v->n_allele; ++i) { + if (i > 1) kputc(',', s); + kputs(v->d.allele[i], s); + } + } else kputc('.', s); + kputc('\t', s); // QUAL + if (memcmp(&v->qual, &bcf_float_missing, 4) == 0) kputc('.', s); // QUAL + else ksprintf(s, "%g", v->qual); + kputc('\t', s); // FILTER + if (v->d.n_flt) { + for (i = 0; i < v->d.n_flt; ++i) { + if (i) kputc(';', s); + kputs(h->id[BCF_DT_ID][v->d.flt[i]].key, s); + } + } else kputc('.', s); + kputc('\t', s); // INFO + if (v->n_info) { + int first = 1; + for (i = 0; i < v->n_info; ++i) { + bcf_info_t *z = &v->d.info[i]; + if ( !z->vptr ) continue; + if ( !first ) kputc(';', s); first = 0; + kputs(h->id[BCF_DT_ID][z->key].key, s); + if (z->len <= 0) continue; + kputc('=', s); + if (z->len == 1) { + if (z->type == BCF_BT_FLOAT) ksprintf(s, "%g", z->v1.f); + else if (z->type != BCF_BT_CHAR) kputw(z->v1.i, s); + else kputc(z->v1.i, s); + } else bcf_fmt_array(s, z->len, z->type, z->vptr); + } + if ( first ) kputc('.', s); + } else kputc('.', s); + // FORMAT and individual information + if (v->n_sample) + { + int 
i,j; + if ( v->n_fmt) + { + int gt_i = -1; + bcf_fmt_t *fmt = v->d.fmt; + int first = 1; + for (i = 0; i < (int)v->n_fmt; ++i) { + if ( !fmt[i].p ) continue; + kputc(!first ? ':' : '\t', s); first = 0; + if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) ) + { + fprintf(stderr, "[E::%s] invalid BCF, the FORMAT tag id=%d not present in the header.\n", __func__, fmt[i].id); + abort(); + } + kputs(h->id[BCF_DT_ID][fmt[i].id].key, s); + if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i; + } + if ( first ) kputs("\t.", s); + for (j = 0; j < v->n_sample; ++j) { + kputc('\t', s); + first = 1; + for (i = 0; i < (int)v->n_fmt; ++i) { + bcf_fmt_t *f = &fmt[i]; + if ( !f->p ) continue; + if (!first) kputc(':', s); first = 0; + if (gt_i == i) + bcf_format_gt(f,j,s); + else + bcf_fmt_array(s, f->n, f->type, f->p + j * f->size); + } + if ( first ) kputc('.', s); + } + } + else + for (j=0; j<=v->n_sample; j++) + kputs("\t.", s); + } + kputc('\n', s); + return 0; +} + +int vcf_write_line(htsFile *fp, kstring_t *line) +{ + int ret; + if ( line->s[line->l-1]!='\n' ) kputc('\n',line); + if ( fp->is_compressed==1 ) + ret = bgzf_write(fp->fp.bgzf, line->s, line->l); + else + ret = hwrite(fp->fp.hfile, line->s, line->l); + return ret==line->l ? 0 : -1; +} + +int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) +{ + int ret; + fp->line.l = 0; + vcf_format1(h, v, &fp->line); + if ( fp->is_compressed==1 ) + ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); + else + ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); + return ret==fp->line.l ? 0 : -1; +} + +/************************ + * Data access routines * + ************************/ + +int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id) +{ + khint_t k; + vdict_t *d = (vdict_t*)h->dict[which]; + k = kh_get(vdict, d, id); + return k == kh_end(d)? 
-1 : kh_val(d, k).id; +} + + +/******************** + *** BCF indexing *** + ********************/ + +hts_idx_t *bcf_index(htsFile *fp, int min_shift) +{ + int n_lvls, i; + bcf1_t *b; + hts_idx_t *idx; + bcf_hdr_t *h; + int64_t max_len = 0, s; + h = bcf_hdr_read(fp); + if ( !h ) return NULL; + int nids = 0; + for (i = 0; i < h->n[BCF_DT_CTG]; ++i) + { + if ( !h->id[BCF_DT_CTG][i].val ) continue; + if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0]; + nids++; + } + if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken. + max_len += 256; + for (n_lvls = 0, s = 1< s; ++n_lvls, s <<= 3); + idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); + b = bcf_init1(); + while (bcf_read1(fp,h, b) >= 0) { + int ret; + ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1); + if (ret < 0) + { + bcf_destroy1(b); + hts_idx_destroy(idx); + return NULL; + } + } + hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)); + bcf_destroy1(b); + bcf_hdr_destroy(h); + return idx; +} + +int bcf_index_build(const char *fn, int min_shift) +{ + htsFile *fp; + hts_idx_t *idx; + if ((fp = hts_open(fn, "rb")) == 0) return -1; + if ( !fp->fp.bgzf->is_compressed ) { hts_close(fp); return -1; } + idx = bcf_index(fp, min_shift); + hts_close(fp); + if ( !idx ) return -1; + hts_idx_save(idx, fn, HTS_FMT_CSI); + hts_idx_destroy(idx); + return 0; +} + +/***************** + *** Utilities *** + *****************/ + +void bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) +{ + int i, ndst_ori = dst->nhrec, need_sync = 0; + for (i=0; inhrec; i++) + { + if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value ) + { + int j; + for (j=0; jhrec[j]->type!=BCF_HL_GEN ) continue; + if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) && !strcmp(src->hrec[i]->value,dst->hrec[j]->value) ) break; + } + if ( j>=ndst_ori ) + need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + } + else + { 
+ bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, src->hrec[i]->vals[0]); + if ( !rec ) + need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + } + } + if ( need_sync ) bcf_hdr_sync(dst); +} +int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) +{ + int i; + if ( line->errcode ) + { + fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,line->errcode); + exit(1); + } + if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id + if ( !src_hdr->ntransl ) // called for the first time, see what needs translating + { + int dict; + for (dict=0; dict<2; dict++) // BCF_DT_ID and BCF_DT_CTG + { + src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int)); + for (i=0; in[dict]; i++) + { + if ( i>=dst_hdr->n[dict] || strcmp(src_hdr->id[dict][i].key,dst_hdr->id[dict][i].key) ) + { + src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key); + src_hdr->ntransl++; + } + else + src_hdr->transl[dict][i] = -1; + } + } + if ( !src_hdr->ntransl ) + { + free(src_hdr->transl[0]); src_hdr->transl[0] = NULL; + free(src_hdr->transl[1]); src_hdr->transl[1] = NULL; + src_hdr->ntransl = -1; + } + if ( src_hdr->ntransl==-1 ) return 0; + } + bcf_unpack(line,BCF_UN_ALL); + + // CHROM + if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid]; + + // FILTER + for (i=0; id.n_flt; i++) + { + int src_id = line->d.flt[i]; + if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 ) + line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id]; + } + + // INFO + for (i=0; in_info; i++) + { + int src_id = line->d.info[i].key; + int dst_id = src_hdr->transl[BCF_DT_ID][src_id]; + if ( dst_id<0 ) continue; + int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; + int dst_size = dst_id>>7 ? ( dst_id>>15 ? 
BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; + if ( src_size==dst_size ) // can overwrite + { + line->d.info[i].key = dst_id; + uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off; + if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; } + else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; } + else { *(uint32_t*)vptr = (uint32_t)dst_id; } + } + else // must realloc + { + bcf_info_t *info = &line->d.info[i]; + assert( !info->vptr_free ); + kstring_t str = {0,0,0}; + bcf_enc_int1(&str, dst_id); + info->vptr_off = str.l; + kputsn((char*)info->vptr, info->vptr_len, &str); + info->vptr = (uint8_t*)str.s + info->vptr_off; + info->vptr_free = 1; + info->key = dst_id; + line->d.shared_dirty |= BCF1_DIRTY_INF; + } + } + + // FORMAT + for (i=0; in_fmt; i++) + { + int src_id = line->d.fmt[i].id; + int dst_id = src_hdr->transl[BCF_DT_ID][src_id]; + if ( dst_id<0 ) continue; + int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; + int dst_size = dst_id>>7 ? ( dst_id>>15 ? 
BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; + if ( src_size==dst_size ) // can overwrite + { + line->d.fmt[i].id = dst_id; + uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off; // pointer to the vector size (4bits) and BT type (4bits) + if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; } + else if ( dst_size==BCF_BT_INT16 ) { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; } + else { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; p[3] = x[2]; p[4] = x[3]; } + } + else // must realloc + { + bcf_fmt_t *fmt = &line->d.fmt[i]; + assert( !fmt->p_free ); + kstring_t str = {0,0,0}; + bcf_enc_int1(&str, dst_id); + fmt->p_off = str.l; + kputsn((char*)fmt->p, fmt->p_len, &str); + fmt->p = (uint8_t*)str.s + fmt->p_off; + fmt->p_free = 1; + fmt->id = dst_id; + line->d.indiv_dirty = 1; + } + } + return 0; +} + +bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr) +{ + bcf_hdr_t *hout = bcf_hdr_init("r"); + char *htxt = bcf_hdr_fmt_text(hdr, 1, NULL); + bcf_hdr_parse(hout, htxt); + free(htxt); + return hout; +} + +bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap) +{ + int hlen; + char *htxt = bcf_hdr_fmt_text(h0, 1, &hlen); + kstring_t str; + bcf_hdr_t *h; + str.l = str.m = 0; str.s = 0; + h = bcf_hdr_init("w"); + bcf_hdr_set_version(h,bcf_hdr_get_version(h0)); + int j; + for (j=0; j 0) { + char *p; + int i = 0, end = n? 
8 : 7; + while ((p = strstr(htxt, "#CHROM\t")) != 0) + if (p > htxt && *(p-1) == '\n') break; + while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p; + if (i != end) { + free(h); free(str.s); + return 0; // malformated header + } + kputsn(htxt, p - htxt, &str); + for (i = 0; i < n; ++i) { + imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]); + if (imap[i] < 0) continue; + kputc('\t', &str); + kputs(samples[i], &str); + } + } else kputsn(htxt, hlen, &str); + while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines + kputc('\n',&str); + bcf_hdr_parse(h, str.s); + free(str.s); + free(htxt); + return h; +} + +int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) +{ + if ( samples && !strcmp("-",samples) ) return 0; // keep all samples + + hdr->nsamples_ori = bcf_hdr_nsamples(hdr); + if ( !samples ) { bcf_hdr_nsamples(hdr) = 0; return 0; } // exclude all samples + + int i, narr = bit_array_size(bcf_hdr_nsamples(hdr)); + hdr->keep_samples = (uint8_t*) calloc(narr,1); + if ( samples[0]=='^' ) + for (i=0; ikeep_samples,i); + + int idx, n, ret = 0; + char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n); + if ( !smpls ) return -1; + for (i=0; ikeep_samples, idx); + else + bit_array_set(hdr->keep_samples, idx); + } + for (i=0; insamples_ori; i++) + if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++; + if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; } + else + { + char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr)); + idx = 0; + for (i=0; insamples_ori; i++) + if ( bit_array_test(hdr->keep_samples,i) ) samples[idx++] = strdup(hdr->samples[i]); + free(hdr->samples); + hdr->samples = samples; + + // delete original samples from the dictionary + vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE]; + int k; + for (k = kh_begin(d); k != kh_end(d); ++k) + if (kh_exist(d, k)) free((char*)kh_key(d, k)); + kh_destroy(vdict, d); 
+ + // add the subset back + hdr->dict[BCF_DT_SAMPLE] = d = kh_init(vdict); + for (i=0; isamples[i], &ignore); + kh_val(d, k) = bcf_idinfo_def; + kh_val(d, k).id = kh_size(d) - 1; + } + bcf_hdr_sync(hdr); + } + + return ret; +} + +int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap) +{ + kstring_t ind; + ind.s = 0; ind.l = ind.m = 0; + if (n) { + bcf_fmt_t *fmt; + int i, j; + fmt = (bcf_fmt_t*)alloca(v->n_fmt * sizeof(bcf_fmt_t)); + uint8_t *ptr = (uint8_t*)v->indiv.s; + for (i = 0; i < v->n_fmt; ++i) + ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]); + for (i = 0; i < (int)v->n_fmt; ++i) { + bcf_fmt_t *f = &fmt[i]; + bcf_enc_int1(&ind, f->id); + bcf_enc_size(&ind, f->n, f->type); + for (j = 0; j < n; ++j) + if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind); + } + for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i; + v->n_sample = i; + } else v->n_sample = 0; + if ( !v->n_sample ) v->n_fmt = 0; + free(v->indiv.s); + v->indiv = ind; + v->unpacked &= ~BCF_UN_FMT; // only BCF is ready for output, VCF will need to unpack again + return 0; +} + +int bcf_is_snp(bcf1_t *v) +{ + int i; + bcf_unpack(v, BCF_UN_STR); + for (i = 0; i < v->n_allele; ++i) + if (strlen(v->d.allele[i]) != 1) break; + return i == v->n_allele; +} + +static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *var) +{ + // The most frequent case + if ( !ref[1] && !alt[1] ) + { + if ( *alt == '.' 
|| *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; } + if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant + var->n = 1; var->type = VCF_SNP; return; + } + + const char *r = ref, *a = alt; + while (*r && *a && *r==*a ) { r++; a++; } + + if ( *a && !*r ) + { + while ( *a ) a++; + var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return; + } + else if ( *r && !*a ) + { + while ( *r ) r++; + var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return; + } + else if ( !*r && !*a ) + { + var->n = 0; var->type = VCF_REF; return; + } + + const char *re = r, *ae = a; + while ( re[1] ) re++; + while ( ae[1] ) ae++; + while ( *re==*ae && re>r && ae>a ) { re--; ae--; } + if ( ae==a ) + { + if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; } + var->n = -(re-r); + if ( *re==*ae ) { var->type = VCF_INDEL; return; } + var->type = VCF_OTHER; return; + } + else if ( re==r ) + { + var->n = ae-a; + if ( *re==*ae ) { var->type = VCF_INDEL; return; } + var->type = VCF_OTHER; return; + } + + var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER; + var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1; + + // should do also complex events, SVs, etc... +} + +static void bcf_set_variant_types(bcf1_t *b) +{ + if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR); + bcf_dec_t *d = &b->d; + if ( d->n_var < b->n_allele ) + { + d->var = (variant_t *) realloc(d->var, sizeof(variant_t)*b->n_allele); + d->n_var = b->n_allele; + } + int i; + b->d.var_type = 0; + for (i=1; in_allele; i++) + { + bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]); + b->d.var_type |= d->var[i].type; + //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. 
%d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type); + } +} + +int bcf_get_variant_types(bcf1_t *rec) +{ + if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); + return rec->d.var_type; +} +int bcf_get_variant_type(bcf1_t *rec, int ith_allele) +{ + if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); + return rec->d.var[ith_allele].type; +} + +int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) +{ + // Is the field already present? + int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key); + if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1; // No such INFO field in the header + if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); + + for (i=0; in_info; i++) + if ( inf_id==line->d.info[i].key ) break; + bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i]; + + if ( !n || (type==BCF_HT_STR && !values) ) + { + if ( inf ) + { + // Mark the tag for removal, free existing memory if necessary + if ( inf->vptr_free ) + { + free(inf->vptr - inf->vptr_off); + inf->vptr_free = 0; + } + line->d.shared_dirty |= BCF1_DIRTY_INF; + inf->vptr = NULL; + } + return 0; + } + + // Encode the values and determine the size required to accommodate the values + kstring_t str = {0,0,0}; + bcf_enc_int1(&str, inf_id); + if ( type==BCF_HT_INT ) + bcf_enc_vint(&str, n, (int32_t*)values, -1); + else if ( type==BCF_HT_REAL ) + bcf_enc_vfloat(&str, n, (float*)values); + else if ( type==BCF_HT_FLAG || type==BCF_HT_STR ) + { + if ( values==NULL ) + bcf_enc_size(&str, 0, BCF_BT_NULL); + else + bcf_enc_vchar(&str, strlen((char*)values), (char*)values); + } + else + { + fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type); + abort(); + } + + // Is the INFO tag already present + if ( inf ) + { + // Is it big enough to accommodate new block? 
+ if ( str.l <= inf->vptr_len + inf->vptr_off ) + { + if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF; + uint8_t *ptr = inf->vptr - inf->vptr_off; + memcpy(ptr, str.s, str.l); + free(str.s); + int vptr_free = inf->vptr_free; + bcf_unpack_info_core1(ptr, inf); + inf->vptr_free = vptr_free; + } + else + { + assert( !inf->vptr_free ); // fix the caller or improve here: this has been modified before + bcf_unpack_info_core1((uint8_t*)str.s, inf); + inf->vptr_free = 1; + line->d.shared_dirty |= BCF1_DIRTY_INF; + } + } + else + { + // The tag is not present, create new one + line->n_info++; + hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info); + inf = &line->d.info[line->n_info-1]; + bcf_unpack_info_core1((uint8_t*)str.s, inf); + inf->vptr_free = 1; + line->d.shared_dirty |= BCF1_DIRTY_INF; + } + line->unpacked |= BCF_UN_INFO; + return 0; +} + +int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n) +{ + if ( !n ) + return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR); + + int i, max_len = 0; + for (i=0; i max_len ) max_len = len; + } + char *out = (char*) malloc(max_len*n); + if ( !out ) return -2; + for (i=0; iunpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); + + for (i=0; in_fmt; i++) + if ( line->d.fmt[i].id==fmt_id ) break; + bcf_fmt_t *fmt = i==line->n_fmt ? 
NULL : &line->d.fmt[i]; + + if ( !n ) + { + if ( fmt ) + { + // Mark the tag for removal, free existing memory if necessary + if ( fmt->p_free ) + { + free(fmt->p - fmt->p_off); + fmt->p_free = 0; + } + line->d.indiv_dirty = 1; + fmt->p = NULL; + } + return 0; + } + + line->n_sample = bcf_hdr_nsamples(hdr); + int nps = n / line->n_sample; // number of values per sample + assert( nps && nps*line->n_sample==n ); // must be divisible by n_sample + + // Encode the values and determine the size required to accommodate the values + kstring_t str = {0,0,0}; + bcf_enc_int1(&str, fmt_id); + if ( type==BCF_HT_INT ) + bcf_enc_vint(&str, n, (int32_t*)values, nps); + else if ( type==BCF_HT_REAL ) + { + bcf_enc_size(&str, nps, BCF_BT_FLOAT); + kputsn((char*)values, nps*line->n_sample*sizeof(float), &str); + } + else if ( type==BCF_HT_STR ) + { + bcf_enc_size(&str, nps, BCF_BT_CHAR); + kputsn((char*)values, nps*line->n_sample, &str); + } + else + { + fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type); + abort(); + } + + if ( !fmt ) + { + // Not present, new format field + line->n_fmt++; + hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt); + + // Special case: VCF specification requires that GT is always first + if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] ) + { + for (i=line->n_fmt-1; i>0; i--) + line->d.fmt[i] = line->d.fmt[i-1]; + fmt = &line->d.fmt[0]; + } + else + fmt = &line->d.fmt[line->n_fmt-1]; + bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt); + line->d.indiv_dirty = 1; + fmt->p_free = 1; + } + else + { + // The tag is already present, check if it is big enough to accomodate the new block + if ( str.l <= fmt->p_len + fmt->p_off ) + { + // good, the block is big enough + if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1; + uint8_t *ptr = fmt->p - fmt->p_off; + memcpy(ptr, str.s, str.l); + free(str.s); + int p_free = fmt->p_free; + bcf_unpack_fmt_core1(ptr, line->n_sample, fmt); + fmt->p_free 
= p_free; + } + else + { + assert( !fmt->p_free ); // fix the caller or improve here: this has been modified before + bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt); + fmt->p_free = 1; + line->d.indiv_dirty = 1; + } + } + line->unpacked |= BCF_UN_FMT; + return 0; +} + + +int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n) +{ + if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); + line->d.shared_dirty |= BCF1_DIRTY_FLT; + line->d.n_flt = n; + if ( !n ) return 0; + hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt); + int i; + for (i=0; id.flt[i] = flt_ids[i]; + return 0; +} + +int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id) +{ + if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); + int i; + for (i=0; id.n_flt; i++) + if ( flt_id==line->d.flt[i] ) break; + if ( id.n_flt ) return 0; // this filter is already set + line->d.shared_dirty |= BCF1_DIRTY_FLT; + if ( flt_id==0 ) // set to PASS + line->d.n_flt = 1; + else if ( line->d.n_flt==1 && line->d.flt[0]==0 ) + line->d.n_flt = 1; + else + line->d.n_flt++; + hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt); + line->d.flt[line->d.n_flt-1] = flt_id; + return 1; +} +int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass) +{ + if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); + int i; + for (i=0; id.n_flt; i++) + if ( flt_id==line->d.flt[i] ) break; + if ( i==line->d.n_flt ) return 0; // the filter is not present + line->d.shared_dirty |= BCF1_DIRTY_FLT; + if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,line->d.n_flt-i); + line->d.n_flt--; + if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0); + return 0; +} + +int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter) +{ + if ( filter[0]=='.' 
&& !filter[1] ) filter = "PASS"; + int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter); + if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1; // not defined in the header + + if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); + if ( id==0 && !line->d.n_flt) return 1; // PASS + + int i; + for (i=0; id.n_flt; i++) + if ( line->d.flt[i]==id ) return 1; + return 0; +} + +static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals) +{ + line->d.shared_dirty |= BCF1_DIRTY_ALS; + + line->n_allele = nals; + hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele); + + char *als = line->d.als; + int n = 0; + while (nd.allele[n] = als; + while ( *als ) als++; + als++; + n++; + } + return 0; +} +int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals) +{ + kstring_t tmp = {0,0,0}; + char *free_old = NULL; + + // If the supplied alleles are not pointers to line->d.als, the existing block can be reused. + int i; + for (i=0; i=line->d.als && alleles[i]d.als+line->d.m_als ) break; + if ( i==nals ) + { + // all alleles point elsewhere, reuse the existing block + tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als; + } + else + free_old = line->d.als; + + for (i=0; id.als = tmp.s; line->d.m_als = tmp.m; + free(free_old); + return _bcf1_sync_alleles(hdr,line,nals); +} + +int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string) +{ + kstring_t tmp; + tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als; + kputs(alleles_string, &tmp); + line->d.als = tmp.s; line->d.m_als = tmp.m; + + int nals = 1; + char *t = line->d.als; + while (*t) + { + if ( *t==',' ) { *t = 0; nals++; } + t++; + } + return _bcf1_sync_alleles(hdr, line, nals); +} + +int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id) +{ + kstring_t tmp; + tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id; + if ( id ) + kputs(id, &tmp); + else + kputs(".", &tmp); + line->d.id = tmp.s; 
line->d.m_id = tmp.m; + line->d.shared_dirty |= BCF1_DIRTY_ID; + return 0; +} + +bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) +{ + int i, id = bcf_hdr_id2int(hdr, BCF_DT_ID, key); + if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL; // no such FMT field in the header + if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); + for (i=0; in_fmt; i++) + { + if ( line->d.fmt[i].id==id ) return &line->d.fmt[i]; + } + return NULL; +} + +bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) +{ + int i, id = bcf_hdr_id2int(hdr, BCF_DT_ID, key); + if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL; // no such INFO field in the header + if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); + for (i=0; in_info; i++) + { + if ( line->d.info[i].key==id ) return &line->d.info[i]; + } + return NULL; +} + +int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) +{ + int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); + if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header + if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type + + if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); + + for (i=0; in_info; i++) + if ( line->d.info[i].key==tag_id ) break; + if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record + if ( type==BCF_HT_FLAG ) return 1; + + bcf_info_t *info = &line->d.info[i]; + if ( type==BCF_HT_STR ) + { + if ( *ndst < info->len+1 ) + { + *ndst = info->len + 1; + *dst = realloc(*dst, *ndst); + } + memcpy(*dst,info->vptr,info->len); + ((uint8_t*)*dst)[info->len] = 0; + return info->len; + } + + // Make sure the buffer is big enough + int size1 = type==BCF_HT_INT ? 
sizeof(int32_t) : sizeof(float); + if ( *ndst < info->len ) + { + *ndst = info->len; + *dst = realloc(*dst, *ndst * size1); + } + + if ( info->len == 1 ) + { + if ( info->type==BCF_BT_FLOAT ) *((float*)*dst) = info->v1.f; + else *((int32_t*)*dst) = info->v1.i; + return 1; + } + + #define BRANCH(type_t, is_missing, is_vector_end, set_missing, out_type_t) { \ + out_type_t *tmp = (out_type_t *) *dst; \ + type_t *p = (type_t *) info->vptr; \ + for (j=0; jlen; j++) \ + { \ + if ( is_vector_end ) return j; \ + if ( is_missing ) set_missing; \ + else *tmp = p[j]; \ + tmp++; \ + } \ + return j; \ + } + switch (info->type) { + case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, int32_t); break; + case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, int32_t); break; + case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, int32_t); break; + case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), float); break; + default: fprintf(stderr,"TODO: %s:%d .. 
info->type=%d\n", __FILE__,__LINE__, info->type); exit(1); + } + #undef BRANCH + return -4; // this can never happen +} + +int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst) +{ + int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); + if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header + if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type + + if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); + + for (i=0; in_fmt; i++) + if ( line->d.fmt[i].id==tag_id ) break; + if ( i==line->n_fmt ) return -3; // the tag is not present in this record + bcf_fmt_t *fmt = &line->d.fmt[i]; + + int nsmpl = bcf_hdr_nsamples(hdr); + if ( !*dst ) + { + *dst = (char**) malloc(sizeof(char*)*nsmpl); + if ( !*dst ) return -4; // could not alloc + (*dst)[0] = NULL; + } + int n = (fmt->n+1)*nsmpl; + if ( *ndst < n ) + { + (*dst)[0] = realloc((*dst)[0], n); + if ( !(*dst)[0] ) return -4; // could not alloc + *ndst = n; + } + for (i=0; ip + i*fmt->n; + uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1); + memcpy(tmp,src,fmt->n); + tmp[fmt->n] = 0; + (*dst)[i] = (char*) tmp; + } + return n; +} + +int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) +{ + int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); + if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header + if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 ) + { + // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT. 
+ if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; + } + else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type + + if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); + + for (i=0; in_fmt; i++) + if ( line->d.fmt[i].id==tag_id ) break; + if ( i==line->n_fmt ) return -3; // the tag is not present in this record + bcf_fmt_t *fmt = &line->d.fmt[i]; + + if ( type==BCF_HT_STR ) + { + int n = fmt->n*bcf_hdr_nsamples(hdr); + if ( *ndst < n ) + { + *dst = realloc(*dst, n); + if ( !*dst ) return -4; // could not alloc + *ndst = n; + } + memcpy(*dst,fmt->p,n); + return n; + } + + // Make sure the buffer is big enough + int nsmpl = bcf_hdr_nsamples(hdr); + int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float); + if ( *ndst < fmt->n*nsmpl ) + { + *ndst = fmt->n*nsmpl; + *dst = realloc(*dst, *ndst*size1); + if ( !dst ) return -4; // could not alloc + } + + #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \ + out_type_t *tmp = (out_type_t *) *dst; \ + type_t *p = (type_t*) fmt->p; \ + for (i=0; in; j++) \ + { \ + if ( is_missing ) set_missing; \ + else if ( is_vector_end ) { set_vector_end; break; } \ + else *tmp = p[j]; \ + tmp++; \ + } \ + for (; jn; j++) { set_vector_end; tmp++; } \ + p = (type_t *)((char *)p + fmt->size); \ + } \ + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; + case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; + case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; + case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), 
bcf_float_set_vector_end(*tmp), float); break; + default: fprintf(stderr,"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1); + } + #undef BRANCH + return nsmpl*fmt->n; +} + diff --git a/star-sys/STAR/source/htslib/vcf_sweep.c b/star-sys/STAR/source/htslib/vcf_sweep.c new file mode 100644 index 0000000..2598460 --- /dev/null +++ b/star-sys/STAR/source/htslib/vcf_sweep.c @@ -0,0 +1,158 @@ +#include "htslib/vcf_sweep.h" +#include "htslib/bgzf.h" + +#define SW_FWD 0 +#define SW_BWD 1 + +struct _bcf_sweep_t +{ + htsFile *file; + bcf_hdr_t *hdr; + BGZF *fp; + + int direction; // to tell if the direction has changed + int block_size; // the size of uncompressed data to hold in memory + bcf1_t *rec; // bcf buffer + int nrec, mrec; // number of used records; total size of the buffer + int lrid, lpos, lnals, lals_len, mlals; // to check uniqueness of a record + char *lals; + + uint64_t *idx; // uncompressed offsets of VCF/BCF records + int iidx, nidx, midx; // i: current offset; n: used; m: allocated + int idx_done; // the index is built during the first pass +}; + +BGZF *hts_get_bgzfp(htsFile *fp); +int hts_useek(htsFile *file, long uoffset, int where); +long hts_utell(htsFile *file); + +static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec) +{ + if ( sw->lrid!=rec->rid ) return 0; + if ( sw->lpos!=rec->pos ) return 0; + if ( sw->lnals!=rec->n_allele ) return 0; + + char *t = rec->d.allele[sw->lnals-1]; + int len = t - rec->d.allele[0] + 1; + while ( *t ) { t++; len++; } + if ( sw->lals_len!=len ) return 0; + if ( memcmp(sw->lals,rec->d.allele[0],len) ) return 0; + return 1; +} + +static void sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec) +{ + sw->lrid = rec->rid; + sw->lpos = rec->pos; + sw->lnals = rec->n_allele; + + char *t = rec->d.allele[sw->lnals-1]; + int len = t - rec->d.allele[0] + 1; + while ( *t ) { t++; len++; } + sw->lals_len = len; + hts_expand(char, len, sw->mlals, sw->lals); + memcpy(sw->lals, rec->d.allele[0], len); +} + +static void 
sw_fill_buffer(bcf_sweep_t *sw) +{ + if ( !sw->iidx ) return; + sw->iidx--; + + int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0); + assert( ret==0 ); + + sw->nrec = 0; + bcf1_t *rec = &sw->rec[sw->nrec]; + while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 ) + { + bcf_unpack(rec, BCF_UN_STR); + + // if not in the last block, stop at the saved record + if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break; + + sw->nrec++; + hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec); + rec = &sw->rec[sw->nrec]; + } + sw_rec_save(sw, &sw->rec[0]); +} + +bcf_sweep_t *bcf_sweep_init(const char *fname) +{ + bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t)); + sw->file = hts_open(fname, "r"); + sw->fp = hts_get_bgzfp(sw->file); + bgzf_index_build_init(sw->fp); + sw->hdr = bcf_hdr_read(sw->file); + sw->mrec = 1; + sw->rec = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t))); + sw->block_size = 1024*1024*3; + sw->direction = SW_FWD; + return sw; +} + +void bcf_empty1(bcf1_t *v); +void bcf_sweep_destroy(bcf_sweep_t *sw) +{ + int i; + for (i=0; imrec; i++) bcf_empty1(&sw->rec[i]); + free(sw->idx); + free(sw->rec); + free(sw->lals); + bcf_hdr_destroy(sw->hdr); + hts_close(sw->file); + free(sw); +} + +static void sw_seek(bcf_sweep_t *sw, int direction) +{ + sw->direction = direction; + if ( direction==SW_FWD ) + hts_useek(sw->file, sw->idx[0], 0); + else + { + sw->iidx = sw->nidx; + sw->nrec = 0; + } +} + +bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw) +{ + if ( sw->direction==SW_BWD ) sw_seek(sw, SW_FWD); + + long pos = hts_utell(sw->file); + + bcf1_t *rec = &sw->rec[0]; + int ret = bcf_read1(sw->file, sw->hdr, rec); + + if ( ret!=0 ) // last record, get ready for sweeping backwards + { + sw->idx_done = 1; + sw->fp->idx_build_otf = 0; + sw_seek(sw, SW_BWD); + return NULL; + } + + if ( !sw->idx_done ) + { + if ( !sw->nidx || pos - sw->idx[sw->nidx-1] > sw->block_size ) + { + sw->nidx++; + hts_expand(uint64_t, sw->nidx, sw->midx, sw->idx); + sw->idx[sw->nidx-1] = pos; + } + } + 
return rec; +} + +bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw) +{ + if ( sw->direction==SW_FWD ) sw_seek(sw, SW_BWD); + if ( !sw->nrec ) sw_fill_buffer(sw); + if ( !sw->nrec ) return NULL; + return &sw->rec[ --sw->nrec ]; +} + +bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw) { return sw->hdr; } + diff --git a/star-sys/STAR/source/htslib/vcfutils.c b/star-sys/STAR/source/htslib/vcfutils.c new file mode 100644 index 0000000..a4a9c2f --- /dev/null +++ b/star-sys/STAR/source/htslib/vcfutils.c @@ -0,0 +1,642 @@ +#include "htslib/vcfutils.h" + +int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) +{ + int i; + for (i=0; in_allele; i++) ac[i]=0; + + // Use INFO/AC,AN field only when asked + if ( which&BCF_UN_INFO ) + { + bcf_unpack(line, BCF_UN_INFO); + int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN"); + int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC"); + int i, an=-1, ac_len=0, ac_type=0; + uint8_t *ac_ptr=NULL; + if ( an_id>=0 && ac_id>=0 ) + { + for (i=0; in_info; i++) + { + bcf_info_t *z = &line->d.info[i]; + if ( z->key == an_id ) an = z->v1.i; + else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; } + } + } + if ( an>=0 && ac_ptr ) + { + int nac = 0; + #define BRANCH_INT(type_t) { \ + type_t *p = (type_t *) ac_ptr; \ + for (i=0; iid[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + } + #undef BRANCH_INT + assert( an>=nac ); // sanity check for missing values + ac[0] = an - nac; + return 1; + } + } + + // Split genotype fields only when asked + if ( which&BCF_UN_FMT ) + { + int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT"); + if ( gt_id<0 ) return 0; + bcf_unpack(line, BCF_UN_FMT); + bcf_fmt_t *fmt_gt = NULL; + for (i=0; i<(int)line->n_fmt; i++) + if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } + if ( !fmt_gt ) return 0; + #define BRANCH_INT(type_t,missing,vector_end) { \ + for (i=0; in_sample; i++) \ + { \ + type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \ + int ial; \ + for 
(ial=0; ialn; ial++) \ + { \ + if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ + if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \ + ac[(p[ial]>>1)-1]++; \ + } \ + } \ + } + switch (fmt_gt->type) { + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + } + #undef BRANCH_INT + return 1; + } + return 0; +} + +int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal) +{ + int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0; + #define BRANCH_INT(type_t,missing,vector_end) { \ + type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \ + for (i=0; in; i++) \ + { \ + if ( p[i] == vector_end ) break; /* smaller ploidy */ \ + if ( !p[i] || p[i] == missing ) continue; /* missing allele */ \ + int tmp = p[i]>>1; \ + if ( tmp>1 ) \ + { \ + if ( !ial ) { ial = tmp; has_alt = 1; } \ + else if ( tmp!=ial ) \ + { \ + if ( tmptype) { + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; + } + #undef BRANCH_INT + + if ( _ial ) *_ial = ial>0 ? ial-1 : ial; + if ( _jal ) *_jal = jal>0 ? jal-1 : jal; + if ( !nals ) return GT_UNKN; + if ( nals==1 ) + return has_ref ? GT_HAPL_R : GT_HAPL_A; + if ( !has_ref ) + return has_alt==1 ? 
GT_HOM_AA : GT_HET_AA; + if ( !has_alt ) + return GT_HOM_RR; + return GT_HET_RA; +} + +int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) +{ + int i; + bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT"); + if ( !gt ) return 0; + + int *ac = (int*) calloc(line->n_allele,sizeof(int)); + + // check if all alleles are populated + #define BRANCH(type_t,missing,vector_end) { \ + for (i=0; in_sample; i++) \ + { \ + type_t *p = (type_t*) (gt->p + i*gt->size); \ + int ial; \ + for (ial=0; ialsize; ial++) \ + { \ + if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ + if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \ + assert((p[ial]>>1)-1n_allele); \ + ac[(p[ial]>>1)-1]++; \ + } \ + } \ + } + switch (gt->type) { + case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + } + #undef BRANCH + + int rm_als = 0, nrm = 0; + for (i=1; in_allele; i++) + { + if ( !ac[i] ) { rm_als |= 1<n_allele, sizeof(int)); + + // create map of indexes from old to new ALT numbering and modify ALT + kstring_t str = {0,0,0}; + kputs(line->d.allele[0], &str); + + int nrm = 0, i,j; // i: ori alleles, j: new alleles + for (i=1, j=1; in_allele; i++) + { + if ( rm_mask & 1<d.allele[i] = NULL; + nrm++; + continue; + } + kputc(',', &str); + kputs(line->d.allele[i], &str); + map[i] = j; + j++; + } + if ( !nrm ) { free(map); free(str.s); return; } + + int nR_ori = line->n_allele; + int nR_new = line->n_allele-nrm; + assert(nR_new > 0); // should not be able to remove reference allele + int nA_ori = nR_ori-1; + int nA_new = nR_new-1; + + int nG_ori = nR_ori*(nR_ori + 1)/2; + int nG_new = nR_new*(nR_new + 1)/2; + + 
bcf_update_alleles_str(header, line, str.s); + + // remove from Number=G, Number=R and Number=A INFO fields. + uint8_t *dat = NULL; + int mdat = 0, ndat = 0, mdat_bytes = 0, nret; + for (i=0; in_info; i++) + { + bcf_info_t *info = &line->d.info[i]; + int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key); + + if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change + + int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key); + if ( type==BCF_HT_FLAG ) continue; + int size = 1; + if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; + + mdat = mdat_bytes / size; + nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type); + mdat_bytes = mdat * size; + if ( nret<0 ) + { + fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, + bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); + exit(1); + } + if ( type==BCF_HT_STR ) + { + str.l = 0; + char *ss = (char*) dat, *se = (char*) dat; + if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) + { + int nexp, inc = 0; + if ( vlen==BCF_VL_A ) + { + nexp = nA_ori; + inc = 1; + } + else + nexp = nR_ori; + for (j=0; jkey), (void*)str.s, str.l, type); + if ( nret<0 ) + { + fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, + bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); + exit(1); + } + continue; + } + + if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) + { + int inc = 0, ntop; + if ( vlen==BCF_VL_A ) + { + assert( nret==nA_ori ); + ntop = nA_ori; + ndat = nA_new; + inc = 1; + } + else + { + assert( nret==nR_ori ); + ntop = nR_ori; + ndat = nR_new; + } + int k = 0; + + #define BRANCH(type_t,is_vector_end) \ + { \ + type_t *ptr = (type_t*) dat; \ + int size = sizeof(type_t); \ + for (j=0; jkey), (void*)dat, ndat, type); + if ( nret<0 ) + { + fprintf(stderr,"[%s:%d %s] Could not update 
INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, + bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); + exit(1); + } + } + + // Update GT fields, the allele indexes might have changed + for (i=1; in_allele; i++) if ( map[i]!=i ) break; + if ( in_allele ) + { + mdat = mdat_bytes / 4; // sizeof(int32_t) + nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat); + mdat_bytes = mdat * 4; + if ( nret>0 ) + { + nret /= line->n_sample; + int32_t *ptr = (int32_t*) dat; + for (i=0; in_sample; i++) + { + for (j=0; j=0 ); + ptr[j] = (map[al]+1)<<1 | (ptr[j]&1); + } + ptr += nret; + } + bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample); + } + } + + // Remove from Number=G, Number=R and Number=A FORMAT fields. + // Assuming haploid or diploid GTs + for (i=0; in_fmt; i++) + { + bcf_fmt_t *fmt = &line->d.fmt[i]; + int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id); + + if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change + + int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id); + if ( type==BCF_HT_FLAG ) continue; + + int size = 1; + if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; + + mdat = mdat_bytes / size; + nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type); + mdat_bytes = mdat * size; + if ( nret<0 ) + { + fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, + bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); + exit(1); + } + + if ( type==BCF_HT_STR ) + { + int size = nret/line->n_sample; // number of bytes per sample + str.l = 0; + if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) + { + int nexp, inc = 0; + if ( vlen==BCF_VL_A ) + { + nexp = nA_ori; + inc = 1; + } + else + nexp = nR_ori; + for (j=0; jn_sample; j++) + { + char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; + int k_src = 0, k_dst = 0, l = str.l; + 
for (k_src=0; k_src=se || !*ptr) break; + while ( ptrn_sample; j++) + { + char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; + int k_src = 0, k_dst = 0, l = str.l; + int nexp = 0; // diploid or haploid? + while ( ptr=se || !*ptr ) break; + while ( ptr=se || !*ptr ) break; + } + } + else // haploid + { + for (k_src=0; k_src=se || !*ptr ) break; + while ( ptrid), (void*)str.s, str.l, type); + if ( nret<0 ) + { + fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, + bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); + exit(1); + } + continue; + } + + int nori = nret / line->n_sample; + if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G + { + int ntop, inc = 0; + if ( vlen==BCF_VL_A ) + { + assert( nori==nA_ori ); // todo: will fail if all values are missing + ntop = nA_ori; + ndat = nA_new*line->n_sample; + inc = 1; + } + else + { + assert( nori==nR_ori ); // todo: will fail if all values are missing + ntop = nR_ori; + ndat = nR_new*line->n_sample; + } + + #define BRANCH(type_t,is_vector_end) \ + { \ + for (j=0; jn_sample; j++) \ + { \ + type_t *ptr_src = ((type_t*)dat) + j*nori; \ + type_t *ptr_dst = ((type_t*)dat) + j*nA_new; \ + int size = sizeof(type_t); \ + int k_src, k_dst = 0; \ + for (k_src=0; k_srcn_sample; + + #define BRANCH(type_t,is_vector_end) \ + { \ + for (j=0; jn_sample; j++) \ + { \ + type_t *ptr_src = ((type_t*)dat) + j*nori; \ + type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \ + int size = sizeof(type_t); \ + int ia, ib, k_dst = 0, k_src; \ + int nset = 0; /* haploid or diploid? 
*/ \ + for (k_src=0; k_srcid), (void*)dat, ndat, type); + if ( nret<0 ) + { + fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, + bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); + exit(1); + } + } + free(dat); + free(str.s); + free(map); +} + diff --git a/star-sys/STAR/source/insertSeqSA.cpp b/star-sys/STAR/source/insertSeqSA.cpp new file mode 100644 index 0000000..0c87bd6 --- /dev/null +++ b/star-sys/STAR/source/insertSeqSA.cpp @@ -0,0 +1,316 @@ +/* + * inserts sequences into the SA + * returns number of SA indexes inserted + */ +#include "insertSeqSA.h" +#include "ErrorWarning.h" +#include "SuffixArrayFuns.h" +#include "SequenceFuns.h" +#include "serviceFuns.cpp" +#include "streamFuns.h" +#include "binarySearch2.h" +#include "funCompareUintAndSuffixes.h" +#include "funCompareUintAndSuffixesMemcmp.h" +#include +#include "genomeSAindex.h" +#include "sortSuffixesBucket.h" + +uint insertSeqSA(PackedArray & SA, PackedArray & SA1, PackedArray & SAi, char * G, char * G1, uint64 nG, uint64 nG1, uint64 nG2, Parameters & P, Genome &mapGen) +{//insert new sequences into the SA + + uint GstrandBit1 = (uint) floor(log(nG+nG1)/log(2))+1; + if (GstrandBit1<32) GstrandBit1=32; //TODO: use simple access function for SA + if ( GstrandBit1+1 != SA.wordLength) + {//sequence is too long - GstrandBit changed + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: cannot insert sequence on the fly because of strand GstrandBit problem\n"; + errOut << "SOLUTION: please contact STAR author at https://groups.google.com/forum/#!forum/rna-star\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P); + }; + + uint N2bit= 1LLU << (SA.wordLength-1); + uint strandMask=~N2bit; + for (uint64 isa=0;isa0 ) + {//- strand + if ( (ind1 & strandMask)>=nG2 ) + {//the first nG bases + ind1+=nG1; //reverse complementary indices are all shifted by the length of the 
sequence + SA.writePacked(isa,ind1); + }; + } else + {//+ strand + if ( ind1>=nG ) + {//the last nG2 bases + ind1+=nG1; //reverse complementary indices are all shifted by the length of the sequence + SA.writePacked(isa,ind1); + }; + }; + }; + + char** seq1=new char*[2]; + + #define GENOME_endFillL 16 + char* seqq=new char [4*nG1+3*GENOME_endFillL];//ends shouldbe filled with 5 to mark boundaries + + seq1[0]=seqq+GENOME_endFillL;//TODO: avoid defining an extra array, use reverse search + seq1[1]=seqq+2*GENOME_endFillL+2*nG1; + + memset(seqq,GENOME_spacingChar,GENOME_endFillL); + memset(seqq+2*nG1+GENOME_endFillL,GENOME_spacingChar,GENOME_endFillL); + memset(seqq+4*nG1+2*GENOME_endFillL,GENOME_spacingChar,GENOME_endFillL); + + memcpy(seq1[0], G1, nG1); + for (uint ii=0; ii3) + {//no index for suffices starting with N + indArray[ii*2]=-1; + } else + { + indArray[ii*2] = suffixArraySearch1(mapGen, seq1, ii, 10000, nG, (iilogMain << timeMonthDayTime(rawtime) << " Finished SA search, number of new SA indices = "<logMain << timeMonthDayTime(rawtime) << " Finished qsort - old " <logMain << timeMonthDayTime(rawtime) << " Finished qsort" <logMain << timeMonthDayTime(rawtime) << " Finished qsort"<logMain << timeMonthDayTime(rawtime) << " Finished ordering suffixes"<logMain << timeMonthDayTime(rawtime) << " Finished sorting SA indices"<logMain << timeMonthDayTime(rawtime) << " Finished inserting SA indices" <0 ) +// {//index missing from the old genome +// uint iSeq1=iSeq; +// int64 ind1=funCalcSAi(seq1[0]+indArray[2*iSeq+1],iL); +// while (ind1 < (int64)(ii-mapGen.genomeSAindexStart[iL]) && indArray[2*iSeq]= (int64) (ii-mapGen.genomeSAindexStart[iL]) ) {//this belongs to the next index +// break; +// }; +// ++iSeq; +// }; +// +// SAi.writePacked(ii,iSA1+iSeq); +// +// for (uint ii0=ind0+1; ii03) {//this iSA contains N, need to mark the previous +// for (uint iL1=iL; iL1 < P.mapGen.gSAindexNbases; iL1++) { +// ind1+=3; +// int64 ind2=mapGen.genomeSAindexStart[iL1]+ind1; +// 
for (; ind2>=0; ind2--) {//find previous index that is not absent +// if ( (SAi[ind2] & mapGen.SAiMarkAbsentMaskC)==0 ) { +// break; +// }; +// }; +// SAi.writePacked(ind2,SAi[ind2] | mapGen.SAiMarkNmaskC); +// ind1 <<= 2; +// }; +// break; +// } else { +// ind1 += g; +// }; +// }; +// }; +// time ( &rawtime ); +// P.inOut->logMain << timeMonthDayTime(rawtime) << " Finished SAi" <logMain << timeMonthDayTime(rawtime) << " Finished SAi" < +#include + + +#define GTF_exonLoci_size 4 +#define GTF_exonTrID(ii) ((ii)*GTF_exonLoci_size) +#define GTF_exonStart(ii) ((ii)*GTF_exonLoci_size+1) +#define GTF_exonEnd(ii) ((ii)*GTF_exonLoci_size+2) +#define GTF_exonGeID(ii) ((ii)*GTF_exonLoci_size+3) + +#define GTF_extrLoci_size 6 +#define GTF_extrTrStart(ii) ((ii)*GTF_extrLoci_size) +#define GTF_extrTrEnd(ii) ((ii)*GTF_extrLoci_size+1) +#define GTF_extrTrID(ii) ((ii)*GTF_extrLoci_size+2) +#define GTF_extrExStart(ii) ((ii)*GTF_extrLoci_size+3) +#define GTF_extrExEnd(ii) ((ii)*GTF_extrLoci_size+4) +#define GTF_extrGeID(ii) ((ii)*GTF_extrLoci_size+5) + +#define GTF_exgeLoci_size 5 +#define GTF_exgeExStart(ii) ((ii)*GTF_exgeLoci_size+0) +#define GTF_exgeExEnd(ii) ((ii)*GTF_exgeLoci_size+1) +#define GTF_exgeExStrand(ii) ((ii)*GTF_exgeLoci_size+2) +#define GTF_exgeGeID(ii) ((ii)*GTF_exgeLoci_size+3) +#define GTF_exgeTrID(ii) ((ii)*GTF_exgeLoci_size+4) + + + +uint64 loadGTF(SjdbClass &sjdbLoci, Parameters &P, string dirOut, Genome &mapGen) {//load gtf file, add junctions to P.sjdb + //returns number of added junctions + if (mapGen.sjdbOverhang==0 || mapGen.pGe.sjdbGTFfile=="-") //no GTF + return 0; + + time_t rawTime; + time(&rawTime); + P.inOut->logMain << timeMonthDayTime(rawTime) <<" ..... processing annotations GTF\n" <logStdOut << timeMonthDayTime(rawTime) <<" ..... 
processing annotations GTF\n" <> geneAttr; + + ifstream sjdbStreamIn ( mapGen.pGe.sjdbGTFfile.c_str() ); + if (sjdbStreamIn.fail()) { + ostringstream errOut; + errOut << "FATAL error, could not open file pGe.sjdbGTFfile=" << mapGen.pGe.sjdbGTFfile <<"\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + + if (mapGen.chrNameIndex.size()==0) + { + for (uint64 ii=0;ii transcriptIDnumber, geneIDnumber; + + uint64 exonN=0; + while (sjdbStreamIn.good()) {//count the number of exons + string chr1,ddd2,featureType; + sjdbStreamIn >> chr1 >> ddd2 >> featureType; + if (chr1.substr(0,1)!="#" && featureType==mapGen.pGe.sjdbGTFfeatureExon) { + exonN++; + }; + sjdbStreamIn.ignore(1000000000,'\n'); //ignore the rest of the line + }; + + if (exonN==0) { + ostringstream errOut; + errOut << "Fatal INPUT FILE error, no ""exon"" lines in the GTF file: " << mapGen.pGe.sjdbGTFfile <<"\n"; + errOut << "Solution: check the formatting of the GTF file, it must contain some lines with ""exon"" in the 3rd column.\n"; + errOut << " Make sure the GTF file is unzipped.\n"; + errOut << " If exons are marked with a different word, use --sjdbGTFfeatureExon .\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + + uint64* exonLoci=new uint64 [exonN*GTF_exonLoci_size]; + char* transcriptStrand = new char [exonN]; + vector transcriptID, geneID; + + exonN=0;//re-calculate + sjdbStreamIn.clear(); + sjdbStreamIn.seekg(0,ios::beg); + while (sjdbStreamIn.good()) { + + string oneLine,chr1,ddd2,featureType; + getline(sjdbStreamIn,oneLine); + istringstream oneLineStream (oneLine); + + oneLineStream >> chr1 >> ddd2 >> featureType; + if (chr1.substr(0,1)!="#" && featureType==mapGen.pGe.sjdbGTFfeatureExon) {//exonic line, process + + if (mapGen.pGe.sjdbGTFchrPrefix!="-") chr1=mapGen.pGe.sjdbGTFchrPrefix + chr1; + + if (mapGen.chrNameIndex.count(chr1)==0) {//chr not in Genome + P.inOut->logMain << "WARNING: while processing 
pGe.sjdbGTFfile=" << mapGen.pGe.sjdbGTFfile <<": chromosome '"<logMain << oneLine <<"\n"<> ex1 >> ex2 >> ddd2 >> str1 >> ddd2; //read all fields except the last + + string oneLine1; + getline(oneLineStream, oneLine1);//get the last field + replace(oneLine1.begin(),oneLine1.end(),';',' ');//to separate attributes + replace(oneLine1.begin(),oneLine1.end(),'=',' ');//for GFF3 processing + replace(oneLine1.begin(),oneLine1.end(),'\t',' ');//replace tabs + replace(oneLine1.begin(),oneLine1.end(),'\"',' ');//now the only separator is space + + //string trID(""), gID(""), attr1(""),gName(""),gBiotype(""); + vector> exAttrNames({ {mapGen.pGe.sjdbGTFtagExonParentTranscript}, {mapGen.pGe.sjdbGTFtagExonParentGene}, mapGen.pGe.sjdbGTFtagExonParentGeneName, mapGen.pGe.sjdbGTFtagExonParentGeneType }); //trID, gID, gName, gBiotype + vector exAttr; //trID, gID, gName, gBiotype + exAttr.resize(exAttrNames.size()); + + for (uint32 ii=0; iilogMain << "WARNING: while processing pGe.sjdbGTFfile=" << mapGen.pGe.sjdbGTFfile <<": no transcript_id for line:\n"; + P.inOut->logMain << oneLine <<"\n"<logMain << "WARNING: while processing pGe.sjdbGTFfile=" << mapGen.pGe.sjdbGTFfile <<": no gene_id for line:\n"; + P.inOut->logMain << oneLine <<"\n"< (exAttr[0],(uint64) transcriptIDnumber.size()));//insert new element if necessary with a new numeric value + if (transcriptID.size() < transcriptIDnumber.size()) transcriptID.push_back(exAttr[0]); + if (str1=='+') { + transcriptStrand[transcriptIDnumber[exAttr[0]]]=1; + } else if (str1=='-') { + transcriptStrand[transcriptIDnumber[exAttr[0]]]=2; + } else { + transcriptStrand[transcriptIDnumber[exAttr[0]]]=0; + }; + + geneIDnumber.insert(std::pair (exAttr[1],(uint64) geneIDnumber.size()));//insert new element if necessary with a $ + if (geneID.size() < geneIDnumber.size()) {//new gene is added + geneID.push_back(exAttr[1]); + geneAttr.push_back({exAttr[2],exAttr[3]}); + }; + + exonLoci[GTF_exonTrID(exonN)]=transcriptIDnumber[exAttr[0]]; + 
exonLoci[GTF_exonStart(exonN)]=ex1+mapGen.chrStart[mapGen.chrNameIndex[chr1]]-1; + exonLoci[GTF_exonEnd(exonN)]=ex2+mapGen.chrStart[mapGen.chrNameIndex[chr1]]-1; + exonLoci[GTF_exonGeID(exonN)]=geneIDnumber[exAttr[1]]; + ++exonN; + };//if (chr1.substr(0,1)!="#" && featureType=="exon") + };// + + if (exonN==0) { + ostringstream errOut; + errOut << "Fatal INPUT FILE error, no valid ""exon"" lines in the GTF file: " << mapGen.pGe.sjdbGTFfile <<"\n"; + errOut << "Solution: check the formatting of the GTF file. Most likely cause is the difference in chromosome naming between GTF and FASTA file.\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + //sort exonLoci by transcript ID and exon coordinates + qsort((void*) exonLoci, exonN, sizeof(uint64)*GTF_exonLoci_size, funCompareUint2); + + {//exon-gene data structures: exon start/end/strand/gene/transcript + //re-sort exons by exons loci + uint64* exgeLoci=new uint64 [exonN*GTF_exgeLoci_size]; //this also contains transcripts start and end + + for (uint64 iex=0; iex); + + ofstream & exgeOut = ofstrOpen(dirOut+"/exonGeTrInfo.tab",ERROR_OUT,P); + exgeOut<); + + ofstream trOut ((dirOut+"/transcriptInfo.tab").c_str()); + trOut<> mapGen.pGe.gChrBinNbits]; + if ( exonLoci[GTF_exonStart(exI)]<=exonLoci[GTF_exonEnd(exI-1)]+1 ) { + P.inOut->logMain << "WARNING: while processing pGe.sjdbGTFfile=" << mapGen.pGe.sjdbGTFfile <<": overlapping or touching exons:\n"; + P.inOut->logMain << mapGen.chrName[chr1] <<"\t"<< exonLoci[GTF_exonStart(exI-1)]+1-mapGen.chrStart[chr1] << "\t"<< exonLoci[GTF_exonEnd(exI-1)]+1-mapGen.chrStart[chr1] <<"\n"; + P.inOut->logMain << mapGen.chrName[chr1] <<"\t"<< exonLoci[GTF_exonStart(exI)]+1-mapGen.chrStart[chr1] << "\t"<< exonLoci[GTF_exonEnd(exI)]+1-mapGen.chrStart[chr1] <<"\n"; + } else { + sjLoci[sjN*sjStride]=exonLoci[GTF_exonEnd(exI-1)]+1; + sjLoci[sjN*sjStride+1]=exonLoci[GTF_exonStart(exI)]-1; + sjLoci[sjN*sjStride+2]=(uint64) transcriptStrand[trIDn]; + 
sjLoci[sjN*sjStride+3]=exonLoci[GTF_exonGeID(exI)]+1;//genes are numbered from 1 + sjN++; + }; + } else { + trIDn=exonLoci[GTF_exonTrID(exI)]; + }; + }; + + qsort((void*) sjLoci, sjN, sizeof(uint64)*sjStride, funCompareUint2); + + char strandChar[3]={'.','+','-'}; + uint64 sjdbN1=sjdbLoci.chr.size(); + sjdbLoci.gene.resize(sjdbN1); //need to resize in case sjdbLoci was loaded from files without gene attribute. TODO make sure gene is always present + for (uint64 ii=0;ii> mapGen.pGe.gChrBinNbits]; + sjdbLoci.chr.push_back(mapGen.chrName[chr1]); + sjdbLoci.start.push_back(sjLoci[ii*sjStride]+1-mapGen.chrStart[chr1]); + sjdbLoci.end.push_back(sjLoci[ii*sjStride+1]+1-mapGen.chrStart[chr1]); + sjdbLoci.str.push_back(strandChar[sjLoci[ii*sjStride+2]]); + sjdbLoci.gene.push_back({sjLoci[ii*sjStride+3]}); + } else { + sjdbLoci.gene.back().insert(sjLoci[ii*sjStride+3]); + }; + }; + + ofstream sjdbList ((dirOut+"/sjdbList.fromGTF.out.tab").c_str()); + for (uint64 ii=sjdbN1;iilogMain << "Processing pGe.sjdbGTFfile=" << mapGen.pGe.sjdbGTFfile <<", found:\n"; + P.inOut->logMain << "\t\t" << transcriptIDnumber.size() <<" transcripts\n" << "\t\t" << exonN << " exons (non-collapsed)\n" << "\t\t" << sjdbLoci.chr.size()-sjdbN1 << " collapsed junctions\n"; + time(&rawTime); + P.inOut->logMain << timeMonthDayTime(rawTime) <<" ..... 
finished GTF processing\n" <0) {//something went wrong with one of threads + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: phtread error while creating thread # " << ithread <<", error code: "<logMain, 1, P); + }; + pthread_mutex_lock(&g_threadChunks.mutexLogMain); + P.inOut->logMain << "Created thread # " <processChunks(); //start main thread + + for (int ithread=1;ithread0) {//something went wrong with one of threads + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: phtread error while joining thread # " << ithread <<", error code: "<logMain, 1, P); + }; + pthread_mutex_lock(&g_threadChunks.mutexLogMain); + P.inOut->logMain << "Joined thread # " <inputParameters(argInN, argIn); + //pMut->readNmates = 1; + Genome* gMut = new Genome(*pMut); + gMut->genomeLoad(); + gMut->Var = NULL; //new Variation(*pMut, gMut->chrStart, gMut->chrNameIndex); + p = pMut; + g = gMut; + } + ~StarRef() + { + delete g; + delete p; + } +}; + +struct Aligner +{ + public: + + // A pointer to the built index containing parameters and the + // reference genome + const StarRef *ref; + + // ra represents the ReadAlign object that is used to make any kind of + // alignment queries + ReadAlign *ra; + + // isOriginal is true iff the aligner is initialized with init() + // instead of init_clone(), and is used for deciding which members + // originated in this instance and can be safely freed upon destruction + int isOriginal; + + Aligner(const StarRef* r) + { + isOriginal = 0; + ref = r; + Transcriptome *mainTranscriptome = nullptr; + ra = new ReadAlign(*(ref->p), *(ref->g), mainTranscriptome, 0); + } + + Aligner(int argInN, char* argIn[]) + { + isOriginal = 1; + ref = new StarRef(argInN, argIn); + Transcriptome *mainTranscriptome = nullptr; + ra = new ReadAlign(*(ref->p), *(ref->g), mainTranscriptome, 0); + } + + // This constructor is used to construct clones of an existing Aligner + // This allows multi-threaded alignment without each thread + // 
constructing its own genome object + Aligner(const Aligner* og) + { + isOriginal = 0; + ref = og->ref; + Transcriptome *mainTranscriptome = nullptr; + ra = new ReadAlign(*(ref->p), *(ref->g), mainTranscriptome, 0); + } + + ~Aligner() + { + delete ra; + if(isOriginal) + { + delete ref; + } + } +}; + + +const char* align_read(Aligner* a, const char* read1Fastq) +{ + static char qname[] = "a"; + a->ra->iRead++; + a->ra->readNmates = 1; + a->ra->readFastq[0] = read1Fastq; + a->ra->readName = qname; + int readStatus = a->ra->oneRead(); + a->ra->readName[1] = '\0'; + if(readStatus != 0) + { + return nullptr; + } + const char* str = a->ra->outputAlignments(); + return str; +} + +const char* align_read_pair(Aligner* a, const char* read1Fastq, const char* read2Fastq) +{ + static char qname[] = "a"; + a->ra->iRead++; + a->ra->readNmates = 2; + a->ra->readFastq[0] = read1Fastq; + a->ra->readFastq[1] = read2Fastq; + a->ra->readName = qname; + + int readStatus = a->ra->oneRead(); + a->ra->readName[1] = '\0'; + if(readStatus != 0) + { + return nullptr; + } + const char* str = a->ra->outputAlignments(); + return str; +} + +Aligner* init_aligner_clone(const Aligner* al) +{ + return new Aligner(al); +} + +Aligner* init_aligner(int argc, char* argv[]) +{ + return new Aligner(argc, argv); +} + +const StarRef* init_star_ref(int argc, char* argv[]) +{ + return new StarRef(argc, argv); +} + +Aligner* init_aligner_from_ref(const StarRef* sr) +{ + return new Aligner(sr); +} + +void destroy_aligner(Aligner *a) +{ + delete a; +} + +void destroy_ref(const StarRef* sr) +{ + delete sr; +} + diff --git a/star-sys/STAR/source/orbit.h b/star-sys/STAR/source/orbit.h new file mode 100644 index 0000000..70a3f7a --- /dev/null +++ b/star-sys/STAR/source/orbit.h @@ -0,0 +1,51 @@ +/* + * Orbit is a wrapper around the key functionality of STAR. It supports + * querying a previously built genome index with single reads and read pairs + * and obtaining BAM records with their alignments to the index. 
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + // StarRef: A separate struct for the parameters and genome information. + // This is so that this can be built in the main thread and declared as + // const to ensure safe access by the other threads + struct StarRef; + + // Aligner: all constructed alignment object which is used to align + // individual reads/read pairs through the functions below + struct Aligner; + + // align_read: align an individual read and get a string of BAM records + const char* align_read(struct Aligner*, const char*); + + // align_read_pair: align a pair of reads and get a string of BAM records + const char* align_read_pair(struct Aligner*, const char*, const char*); + + // init_aligner_clone: create an aligner from the same reference as an + // existing aligner, sharing key structures with it and saving memory in + // multi-threaded applications + // NOTE: this function is deprecated in favor of a StarRef-based workflow + struct Aligner* init_aligner_clone(const struct Aligner*); + + // init_aligner: initialize an aligner given the array of parameters which + // would be passed to STAR + struct Aligner* init_aligner(int, char*[]); + + // init_star_ref: build a star reference with a given set of arguments + const struct StarRef* init_star_ref(int, char*[]); + + // init_aligner_from_ref takes a StarRef struct with an already built + // genome and builds an aligner around it + struct Aligner* init_aligner_from_ref(const struct StarRef*); + + // destroy_aligner: frees the memory occupied by an aligner + void destroy_aligner(struct Aligner*); + + //destroy_ref: frees the memory occupied by a reference + void destroy_ref(const struct StarRef*); + +#ifdef __cplusplus +} +#endif + diff --git a/star-sys/STAR/source/outputSJ.cpp b/star-sys/STAR/source/outputSJ.cpp new file mode 100644 index 0000000..750b446 --- /dev/null +++ b/star-sys/STAR/source/outputSJ.cpp @@ -0,0 +1,160 @@ +#include "ReadAlignChunk.h" +#include "Parameters.h" +#include 
"OutSJ.h" +#include +#include "ErrorWarning.h" + +int compareUint(const void* i1, const void* i2) {//compare uint arrays + uint s1=*( (uint*)i1 ); + uint s2=*( (uint*)i2 ); + + if (s1>s2) { + return 1; + } else if (s1> Log.timing.out"); + + + Junction oneSJ(RAchunk[0]->mapGen); + char** sjChunks = new char* [P.runThreadN+1]; + #define OUTSJ_limitScale 5 + OutSJ allSJ (P.limitOutSJcollapsed*OUTSJ_limitScale,P,RAchunk[0]->mapGen); + + if (P.outFilterBySJoutStage!=1) {//chunkOutSJ + for (int ic=0;icchunkOutSJ->data; + memset(sjChunks[ic]+RAchunk[ic]->chunkOutSJ->N*oneSJ.dataSize,255,oneSJ.dataSize);//mark the junction after last with big number + }; + } else {//chunkOutSJ1 + for (int ic=0;icchunkOutSJ1->data; + memset(sjChunks[ic]+RAchunk[ic]->chunkOutSJ1->N*oneSJ.dataSize,255,oneSJ.dataSize);//mark the junction after last with big number + }; + }; + + while (true) { + int icOut=-1;//chunk from which the junction is output + for (int ic=0;ic0 \ + || ( ( *oneSJ.countUnique>=(uint) P.outSJfilterCountUniqueMin[(*oneSJ.motif+1)/2] \ + || (*oneSJ.countMultiple+*oneSJ.countUnique)>=(uint) P.outSJfilterCountTotalMin[(*oneSJ.motif+1)/2] )\ + && *oneSJ.overhangLeft >= (uint) P.outSJfilterOverhangMin[(*oneSJ.motif+1)/2] \ + && *oneSJ.overhangRight >= (uint) P.outSJfilterOverhangMin[(*oneSJ.motif+1)/2] \ + && ( (*oneSJ.countMultiple+*oneSJ.countUnique)>P.outSJfilterIntronMaxVsReadN.size() || *oneSJ.gap<=(uint) P.outSJfilterIntronMaxVsReadN[*oneSJ.countMultiple+*oneSJ.countUnique-1]) ); + + if (sjFilter) {//record the junction in all SJ + memcpy(allSJ.data+allSJ.N*oneSJ.dataSize,sjChunks[icOut],oneSJ.dataSize); + allSJ.N++; + if (allSJ.N == P.limitOutSJcollapsed*OUTSJ_limitScale ) { + ostringstream errOut; + errOut <<"EXITING because of fatal error: buffer size for SJ output is too small\n"; + errOut <<"Solution: increase input parameter --limitOutSJcollapsed\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + }; + + sjChunks[icOut] += 
oneSJ.dataSize;//shift icOut-chunk by one junction + }; + + bool* sjFilter=new bool[allSJ.N]; + if (P.outFilterBySJoutStage!=2) { + //filter non-canonical junctions that are close to canonical + uint* sjA = new uint [allSJ.N*3]; + for (uint ii=0;ii0) x1=*( (uint*)(allSJ.data+(ii-1)*oneSJ.dataSize) ); //previous junction donor + if (ii+1= (uint) P.outSJfilterDistToOtherSJmin[(*oneSJ.motif+1)/2]; + sjA[ii*3]=*oneSJ.start+(uint)*oneSJ.gap;//acceptor + sjA[ii*3+1]=ii; + + if (*oneSJ.annot==0) { + sjA[ii*3+2]=*oneSJ.motif; + } else { + sjA[ii*3+2]=SJ_MOTIF_SIZE+1; + }; + + }; + qsort((void*) sjA, allSJ.N, sizeof(uint)*3, compareUint); + for (uint ii=0;ii0) x1=sjA[ii*3-3]; //previous junction donor + if (ii+1= (uint) P.outSJfilterDistToOtherSJmin[(sjA[ii*3+2]+1)/2] ); + }; + }; + }; + + //output junctions + P.sjAll[0].reserve(allSJ.N); + P.sjAll[1].reserve(allSJ.N); + + if (P.outFilterBySJoutStage!=1) {//output file + ofstream outSJfileStream((P.outFileNamePrefix+"SJ.out.tab").c_str()); + for (uint ii=0;iilogMain <<"Detected " <0: genome files exact sizes in bytes. Typically, this should not be defined by the user. + +genomeConsensusFile - + string: VCF file with consensus SNPs (i.e. alternative allele is the major (AF>0.5) allele) + +### Genome Indexing Parameters - only used with --runMode genomeGenerate +genomeChrBinNbits 18 + int: =log2(chrBin), where chrBin is the size of the bins for genome storage: each chromosome will occupy an integer number of bins. For a genome with large number of contigs, it is recommended to scale this parameter as min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)]). + +genomeSAindexNbases 14 + int: length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1). + +genomeSAsparseD 1 + int>0: suffux array sparsity, i.e. 
distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction + +genomeSuffixLengthMax -1 + int: maximum length of the suffixes, has to be longer than read length. -1 = infinite. + + +### Splice Junctions Database +sjdbFileChrStartEnd - + string(s): path to the files with genomic coordinates (chr start end strand) for the splice junction introns. Multiple files can be supplied and will be concatenated. + +sjdbGTFfile - + string: path to the GTF file with annotations + +sjdbGTFchrPrefix - + string: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSEMBL annotations with UCSC genomes) + +sjdbGTFfeatureExon exon + string: feature type in GTF file to be used as exons for building transcripts + +sjdbGTFtagExonParentTranscript transcript_id + string: GTF attribute name for parent transcript ID (default "transcript_id" works for GTF files) + +sjdbGTFtagExonParentGene gene_id + string: GTF attribute name for parent gene ID (default "gene_id" works for GTF files) + +sjdbGTFtagExonParentGeneName gene_name + string(s): GTF attribute name for parent gene name + +sjdbGTFtagExonParentGeneType gene_type gene_biotype + string(s): GTF attribute name for parent gene type + +sjdbOverhang 100 + int>0: length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1) + +sjdbScore 2 + int: extra alignment score for alignments that cross database junctions + +sjdbInsertSave Basic + string: which files to save when sjdb junctions are inserted on the fly at the mapping step + Basic ... only small junction / transcript files + All ... all files including big Genome, SA and SAindex - this will create a complete genome directory + +### Variation parameters +varVCFfile - + string: path to the VCF file that contains variation data. 
+ +### Input Files +inputBAMfile - + string: path to BAM input file, to be used with --runMode inputAlignmentsFromBAM + +### Read Parameters +readFilesType Fastx + string: format of input read files + Fastx ... FASTA or FASTQ + SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view + SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view + +readFilesIn Read1 Read2 + string(s): paths to files that contain input read1 (and, if needed, read2) + +readFilesPrefix - + string: prefix for the read files names, i.e. it will be added in front of the strings in --readFilesIn + -: no prefix + +readFilesCommand - + string(s): command line to execute for each of the input files. This command should generate FASTA or FASTQ text and send it to stdout + For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. + +readMapNumber -1 + int: number of reads to map from the beginning of the file + -1: map all reads + +readMatesLengthsIn NotEqual + string: Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same. NotEqual is safe in all situations. + +readNameSeparator / + string(s): character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed) + +clip3pNbases 0 + int(s): number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + +clip5pNbases 0 + int(s): number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAdapterSeq - + string(s): adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAdapterMMp 0.1 + double(s): max proportion of mismatches for 3p adapter clipping for each mate. If one value is given, it will be assumed the same for both mates. 
+ +clip3pAfterAdapterNbases 0 + int(s): number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates. + + +### Limits +limitGenomeGenerateRAM 31000000000 + int>0: maximum available RAM (bytes) for genome generation + +limitIObufferSize 150000000 + int>0: max available buffers size (bytes) for input/output, per thread + +limitOutSAMoneReadBytes 100000 + int>0: max size of the SAM record (bytes) for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax) + +limitOutSJoneRead 1000 + int>0: max number of junctions for one read (including all multi-mappers) + +limitOutSJcollapsed 1000000 + int>0: max number of collapsed junctions + +limitBAMsortRAM 0 + int>=0: maximum available RAM (bytes) for sorting BAM. If =0, it will be set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory option. + +limitSjdbInsertNsj 1000000 + int>=0: maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run + +limitNreadsSoft -1 + int: soft limit on the number of reads + +### Output: general +outFileNamePrefix ./ + string: output files name prefix (including full or relative path). Can only be defined on the command line. + +outTmpDir - + string: path to a directory that will be used as temporary by STAR. All contents of this directory will be removed! + - the temp directory will default to outFileNamePrefix_STARtmp + +outTmpKeep None + string: whether to keep the temporary files after STAR run is finished + None ... remove all temporary files + All ... keep all files + +outStd Log + string: which output will be directed to stdout (standard out) + Log ... log messages + SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out + BAM_Unsorted ... 
alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted + BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate + BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + +outReadsUnmapped None + string: output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s). + None ... no output + Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2 + +outQSconversionAdd 0 + int: add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31) + +outMultimapperOrder Old_2.4 + string: order of multimapping alignments in the output files + Old_2.4 ... quasi-random order used before 2.5.0 + Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases. + +### Output: SAM and BAM +outSAMtype SAM + strings: type of SAM/BAM output + 1st word: + BAM ... output BAM without sorting + SAM ... output SAM without sorting + None ... no SAM/BAM output + 2nd, 3rd: + Unsorted ... standard unsorted + SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM. + +outSAMmode Full + string: mode of SAM output + None ... no SAM output + Full ... full SAM output + NoQS ... full SAM but without quality scores + +outSAMstrandField None + string: Cufflinks-like strand field flag + None ... not used + intronMotif ... strand derived from the intron motif. Reads with inconsistent and/or non-canonical introns are filtered out. + +outSAMattributes Standard + string: a string of desired SAM attributes, in the order desired for the output SAM + NH HI AS nM NM MD jM jI XS MC ch ... any combination in any order + None ... no attributes + Standard ... NH HI AS nM + All ... 
NH HI AS nM NM MD jM jI MC ch + vA ... variant allele + vG ... genomic coordinate of the variant overlapped by the read + vW ... 0/1 - alignment does not pass / passes WASP filtering. Requires --waspOutputMode SAMtag + CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing + Unsupported/undocumented: + rB ... alignment block read/genomic coordinates + vR ... read coordinate of the variant + +outSAMattrIHstart 1 + int>=0: start value for the IH attribute. 0 may be required by some downstream software, such as Cufflinks or StringTie. + +outSAMunmapped None + string(s): output of unmapped reads in the SAM format + 1st word: + None ... no output + Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam) + 2nd word: + KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads. + +outSAMorder Paired + string: type of sorting for the SAM output + Paired: one mate after the other for all paired alignments + PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + +outSAMprimaryFlag OneBestScore + string: which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG + OneBestScore ... only one alignment with the best score is primary + AllBestScore ... all alignments with the best score are primary + +outSAMreadID Standard + string: read ID record type + Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end + Number ... read number (index) in the FASTx file + +outSAMmapqUnique 255 + int: 0 to 255: the MAPQ value for unique mappers + +outSAMflagOR 0 + int: 0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e. FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, and after outSAMflagAND. 
Can be used to set specific bits that are not set otherwise. + +outSAMflagAND 65535 + int: 0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e. FLAG=FLAG & outSAMflagAND. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are set otherwise. + +outSAMattrRGline - + string(s): SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z". + xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. + Comma separated RG lines correspond to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g. + --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy + +outSAMheaderHD - + strings: @HD (header) line of the SAM header + +outSAMheaderPG - + strings: extra @PG (software) line of the SAM header (in addition to STAR) + +outSAMheaderCommentFile - + string: path to the file with @CO (comment) lines of the SAM header + +outSAMfilter None + string(s): filter the output into main SAM/BAM files + KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + + +outSAMmultNmax -1 + int: max number of multiple alignments for a read that will be output to the SAM/BAM files. + -1 ... all alignments (up to --outFilterMultimapNmax) will be output + +outSAMtlen 1 + int: calculation method for the TLEN field in the SAM/BAM files + 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate + 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. 
This is different from 1 for overlapping mates with protruding ends + +outBAMcompression 1 + int: -1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression + +outBAMsortingThreadN 0 + int: >=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN). + +outBAMsortingBinsN 50 + int: >0: number of genome bins for coordinate-sorting + +### BAM processing +bamRemoveDuplicatesType - + string: mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only + - ... no duplicate removal/marking + UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical + UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers. + +bamRemoveDuplicatesMate2basesN 0 + int>0: number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE) + +### Output Wiggle +outWigType None + string(s): type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate . + 1st word: + None ... no signal output + bedGraph ... bedGraph format + wiggle ... wiggle format + 2nd word: + read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc + read2 ... signal from only 2nd read + +outWigStrand Stranded + string: strandedness of wiggle/bedGraph output + Stranded ... separate strands, str1 and str2 + Unstranded ... collapsed strands + +outWigReferencesPrefix - + string: prefix matching reference names to include in the output wiggle file, e.g. "chr", default "-" - include all references + +outWigNorm RPM + string: type of normalization for the signal + RPM ... reads per million of mapped reads + None ... no normalization, "raw" counts + +### Output Filtering +outFilterType Normal + string: type of filtering + Normal ... standard filtering using only current alignment + BySJout ... 
keep only those reads that contain junctions that passed filtering into SJ.out.tab + +outFilterMultimapScoreRange 1 + int: the score range below the maximum score for multimapping alignments + +outFilterMultimapNmax 10 + int: maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value. + Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out . + +outFilterMismatchNmax 10 + int: alignment will be output only if it has no more mismatches than this value. + +outFilterMismatchNoverLmax 0.3 + real: alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value. + +outFilterMismatchNoverReadLmax 1.0 + real: alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value. + + +outFilterScoreMin 0 + int: alignment will be output only if its score is higher than or equal to this value. + +outFilterScoreMinOverLread 0.66 + real: same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads) + +outFilterMatchNmin 0 + int: alignment will be output only if the number of matched bases is higher than or equal to this value. + +outFilterMatchNminOverLread 0.66 + real: same as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads). + +outFilterIntronMotifs None + string: filter alignment using their motifs + None ... no filtering + RemoveNoncanonical ... filter out alignments that contain non-canonical junctions + RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept. + +outFilterIntronStrands RemoveInconsistentStrands + string: filter alignments + RemoveInconsistentStrands ... 
remove alignments that have junctions with inconsistent strands + None ... no filtering + +### Output Filtering: Splice Junctions +outSJfilterReads All + string: which reads to consider for collapsed splice junctions output + All: all reads, unique- and multi-mappers + Unique: uniquely mapping reads only + +outSJfilterOverhangMin 30 12 12 12 + 4 integers: minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + does not apply to annotated junctions + +outSJfilterCountUniqueMin 3 1 1 1 + 4 integers: minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + +outSJfilterCountTotalMin 3 1 1 1 + 4 integers: minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + +outSJfilterDistToOtherSJmin 10 0 5 10 + 4 integers>=0: minimum allowed distance to other junctions' donor/acceptor + does not apply to annotated junctions + +outSJfilterIntronMaxVsReadN 50000 100000 200000 + N integers>=0: maximum gap allowed for junctions supported by 1,2,3,,,N reads + i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. 
by >=4 reads any gap <=alignIntronMax + does not apply to annotated junctions + +### Scoring +scoreGap 0 + int: splice junction penalty (independent of intron motif) + +scoreGapNoncan -8 + int: non-canonical junction penalty (in addition to scoreGap) + +scoreGapGCAG -4 + GC/AG and CT/GC junction penalty (in addition to scoreGap) + +scoreGapATAC -8 + AT/AC and GT/AT junction penalty (in addition to scoreGap) + +scoreGenomicLengthLog2scale -0.25 + extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength) + +scoreDelOpen -2 + deletion open penalty + +scoreDelBase -2 + deletion extension penalty per base (in addition to scoreDelOpen) + +scoreInsOpen -2 + insertion open penalty + +scoreInsBase -2 + insertion extension penalty per base (in addition to scoreInsOpen) + +scoreStitchSJshift 1 + maximum score reduction while searching for SJ boundaries in the stitching step + + +### Alignments and Seeding + +seedSearchStartLmax 50 + int>0: defines the search start point through the read - the read is split into pieces no longer than this value + +seedSearchStartLmaxOverLread 1.0 + real: seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads) + +seedSearchLmax 0 + int>=0: defines the maximum length of the seeds, if =0 max seed length is infinite + +seedMultimapNmax 10000 + int>0: only pieces that map fewer than this value are utilized in the stitching procedure + +seedPerReadNmax 1000 + int>0: max number of seeds per read + +seedPerWindowNmax 50 + int>0: max number of seeds per window + +seedNoneLociPerWindow 10 + int>0: max number of one seed loci per window + +seedSplitMin 12 + int>0: min length of the seed sequences split by Ns or mate gap + +alignIntronMin 21 + minimum intron size: genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion + +alignIntronMax 0 + maximum intron size, if 0, max intron size will be determined by 
(2^winBinNbits)*winAnchorDistNbins + +alignMatesGapMax 0 + maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins + +alignSJoverhangMin 5 + int>0: minimum overhang (i.e. block size) for spliced alignments + +alignSJstitchMismatchNmax 0 -1 0 0 + 4*int>=0: maximum number of mismatches for stitching of the splice junctions (-1: no limit). + (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. + +alignSJDBoverhangMin 3 + int>0: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments + +alignSplicedMateMapLmin 0 + int>0: minimum mapped length for a read mate that is spliced + +alignSplicedMateMapLminOverLmate 0.66 + real>0: alignSplicedMateMapLmin normalized to mate length + +alignWindowsPerReadNmax 10000 + int>0: max number of windows per read + +alignTranscriptsPerWindowNmax 100 + int>0: max number of transcripts per window + +alignTranscriptsPerReadNmax 10000 + int>0: max number of different alignments per read to consider + +alignEndsType Local + string: type of read ends alignment + Local ... standard local alignment with soft-clipping allowed + EndToEnd ... force end-to-end read alignment, do not soft-clip + Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment + Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment + +alignEndsProtrude 0 ConcordantPair + int, string: allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate + 1st word: int: maximum number of protrusion bases allowed + 2nd word: string: + ConcordantPair ... report alignments with non-zero protrusion as concordant pairs + DiscordantPair ... 
report alignments with non-zero protrusion as discordant pairs + +alignSoftClipAtReferenceEnds Yes + string: allow the soft-clipping of the alignments past the end of the chromosomes + Yes ... allow + No ... prohibit, useful for compatibility with Cufflinks + +alignInsertionFlush None + string: how to flush ambiguous insertion positions + None ... insertions are not flushed + Right ... insertions are flushed to the right + +### Paired-End reads +peOverlapNbasesMin 0 + int>=0: minimum number of overlap bases to trigger mates merging and realignment + +peOverlapMMp 0.01 + real, >=0 & <1: maximum proportion of mismatched bases in the overlap area + +### Windows, Anchors, Binning + +winAnchorMultimapNmax 50 + int>0: max number of loci anchors are allowed to map to + +winBinNbits 16 + int>0: =log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins. + +winAnchorDistNbins 9 + int>0: max number of bins between two anchors that allows aggregation of anchors into one window + +winFlankNbins 4 + int>0: log2(winFlank), where winFlank is the size of the left and right flanking regions for each window + +winReadCoverageRelativeMin 0.5 + real>=0: minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only. + +winReadCoverageBasesMin 0 + int>0: minimum number of bases covered by the seeds in a window, for STARlong algorithm only. + +### Chimeric Alignments +chimOutType Junctions + string(s): type of chimeric output + Junctions ... Chimeric.out.junction + SeparateSAMold ... output old SAM into separate Chimeric.out.sam file + WithinBAM ... output into main aligned BAM files (Aligned.*.bam) + WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present) + WithinBAM SoftClip ... 
soft-clipping in the CIGAR for supplemental chimeric alignments + +chimSegmentMin 0 + int>=0: minimum length of chimeric segment length, if ==0, no chimeric output + +chimScoreMin 0 + int>=0: minimum total (summed) score of the chimeric segments + +chimScoreDropMax 20 + int>=0: max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length + +chimScoreSeparation 10 + int>=0: minimum difference (separation) between the best chimeric score and the next one + +chimScoreJunctionNonGTAG -1 + int: penalty for a non-GT/AG chimeric junction + +chimJunctionOverhangMin 20 + int>=0: minimum overhang for a chimeric junction + +chimSegmentReadGapMax 0 + int>=0: maximum gap in the read sequence between chimeric segments + +chimFilter banGenomicN + string(s): different filters for chimeric alignments + None ... no filtering + banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction + +chimMainSegmentMultNmax 10 + int>=1: maximum number of multi-alignments for the main chimeric segment. =1 will prohibit multimapping main segments. + +chimMultimapNmax 0 + int>=0: maximum number of chimeric multi-alignments + 0 ... use the old scheme for chimeric detection which only considered unique alignments + +chimMultimapScoreRange 1 + int>=0: the score range for multi-mapping chimeras below the best chimeric score. Only works with --chimMultimapNmax > 1 + +chimNonchimScoreDropMin 20 + int>=0: to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be smaller than this value + +chimOutJunctionFormat 0 + int: formatting type for the Chimeric.out.junction file + 0 ... no comment lines/headers + 1 ... comment lines at the end of the file: command line and Nreads: total, unique, multi + +### Quantification of Annotations +quantMode - + string(s): types of quantification requested + - ... none + TranscriptomeSAM ... 
output SAM/BAM alignments to transcriptome into a separate file + GeneCounts ... count reads per gene + +quantTranscriptomeBAMcompression 1 1 + int: -2 to 10 transcriptome BAM compression level + -2 ... no BAM output + -1 ... default compression (6?) + 0 ... no compression + 10 ... maximum compression + +quantTranscriptomeBan IndelSoftclipSingleend + string: prohibit various alignment type + IndelSoftclipSingleend ... prohibit indels, soft clipping and single-end alignments - compatible with RSEM + Singleend ... prohibit single-end alignments + +### 2-pass Mapping +twopassMode None + string: 2-pass mapping mode. + None ... 1-pass mapping + Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly + +twopass1readsN -1 + int: number of reads to process for the 1st step. Use very large number (or default -1) to map all reads in the first step. + + +### WASP parameters +waspOutputMode None + string: WASP allele-specific output type. This is a re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061–1063 (2015), https://www.nature.com/articles/nmeth.3582 . + SAMtag ... add WASP tags to the alignments that pass WASP filtering + +### STARsolo (single cell RNA-seq) parameters +soloType None + string(s): type of single-cell RNA-seq + Droplet ... one cell barcode and one UMI barcode in read2, e.g. Drop-seq and 10X Chromium + +soloCBwhitelist - + string: file with whitelist of cell barcodes + +soloCBstart 1 + int>0: cell barcode start base + +soloCBlen 16 + int>0: cell barcode length + +soloUMIstart 17 + int>0: UMI start base + +soloUMIlen 10 + int>0: UMI length + +soloBarcodeReadLength 1 + int: length of the barcode read + 1 ... equal to sum of soloCBlen+soloUMIlen + 0 ... not defined, do not check + +soloStrand Forward + string: strandedness of the solo libraries: + Unstranded ... 
no strand information + Forward ... read strand same as the original RNA molecule + Reverse ... read strand opposite to the original RNA molecule + +soloFeatures Gene + string(s): genomic features for which the UMI counts per Cell Barcode are collected + Gene ... genes: reads match the gene transcript + SJ ... splice junctions: reported in SJ.out.tab + GeneFull ... full genes: count all reads overlapping genes' exons and introns + +soloUMIdedup 1MM_All + string(s): type of UMI deduplication (collapsing) algorithm + 1MM_All ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once) + 1MM_Directional ... follows the "directional" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017). + 1MM_NotCollapsed ... UMIs with 1 mismatch distance to others are not collapsed (i.e. all counted) + +soloOutFileNames Solo.out/ genes.tsv barcodes.tsv matrix.mtx matrixSJ.mtx matrixGeneFull.mtx + string(s) file names for STARsolo output + 1st word ... file name prefix + 2nd word ... gene IDs and names + 3rd word ... barcode sequences + 4th word ... cell/Gene counts matrix + 5th word ... cell/SJ counts matrix + 6th word ... 
cell/GeneFull counts matrix + diff --git a/star-sys/STAR/source/parametersDefault.xxd b/star-sys/STAR/source/parametersDefault.xxd new file mode 100644 index 0000000..ddc09a3 --- /dev/null +++ b/star-sys/STAR/source/parametersDefault.xxd @@ -0,0 +1,3202 @@ +unsigned char parametersDefault[] = { + 0x23, 0x23, 0x23, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x73, + 0x0a, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x47, 0x65, 0x6e, 0x6f, + 0x6d, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x32, 0x2e, 0x37, 0x2e, 0x31, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x65, 0x61, 0x72, 0x6c, + 0x69, 0x65, 0x73, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, + 0x69, 0x6e, 0x64, 0x65, 0x78, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, + 0x6e, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x61, 0x74, 0x69, 0x62, 0x6c, 0x65, + 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x53, + 0x54, 0x41, 0x52, 0x20, 0x72, 0x65, 0x6c, 0x65, 0x61, 0x73, 0x65, 0x2e, + 0x20, 0x50, 0x6c, 0x65, 0x61, 0x73, 0x65, 0x20, 0x64, 0x6f, 0x20, 0x6e, + 0x6f, 0x74, 0x20, 0x63, 0x68, 0x61, 0x6e, 0x67, 0x65, 0x20, 0x74, 0x68, + 0x69, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x21, 0x0a, 0x0a, 0x23, + 0x23, 0x23, 0x20, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, + 0x20, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x0a, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x65, 0x74, 0x65, 0x72, 0x73, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x6e, 0x61, + 0x6d, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x61, 0x20, 0x75, 0x73, 0x65, 0x72, + 0x2d, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x20, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x20, 0x66, 0x69, 0x6c, 0x65, + 0x2c, 0x20, 0x22, 0x2d, 0x22, 0x3a, 0x20, 0x6e, 0x6f, 0x6e, 0x65, 0x2e, + 0x20, 0x43, 0x61, 0x6e, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x62, 0x65, + 
0x20, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x20, 0x6f, 0x6e, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x61, 0x6e, 0x64, 0x20, + 0x6c, 0x69, 0x6e, 0x65, 0x2e, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x53, + 0x79, 0x73, 0x74, 0x65, 0x6d, 0x0a, 0x73, 0x79, 0x73, 0x53, 0x68, 0x65, + 0x6c, 0x6c, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, + 0x6e, 0x67, 0x3a, 0x20, 0x70, 0x61, 0x74, 0x68, 0x20, 0x74, 0x6f, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x73, 0x68, 0x65, 0x6c, 0x6c, 0x20, 0x62, 0x69, + 0x6e, 0x61, 0x72, 0x79, 0x2c, 0x20, 0x70, 0x72, 0x65, 0x66, 0x65, 0x72, + 0x61, 0x62, 0x6c, 0x79, 0x20, 0x62, 0x61, 0x73, 0x68, 0x2c, 0x20, 0x65, + 0x2e, 0x67, 0x2e, 0x20, 0x2f, 0x62, 0x69, 0x6e, 0x2f, 0x62, 0x61, 0x73, + 0x68, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, + 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x64, 0x65, 0x66, + 0x61, 0x75, 0x6c, 0x74, 0x20, 0x73, 0x68, 0x65, 0x6c, 0x6c, 0x20, 0x69, + 0x73, 0x20, 0x65, 0x78, 0x65, 0x63, 0x75, 0x74, 0x65, 0x64, 0x2c, 0x20, + 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x2f, 0x62, + 0x69, 0x6e, 0x2f, 0x73, 0x68, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, + 0x77, 0x61, 0x73, 0x20, 0x72, 0x65, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, + 0x20, 0x74, 0x6f, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x20, 0x6f, 0x6e, 0x20, + 0x73, 0x6f, 0x6d, 0x65, 0x20, 0x55, 0x62, 0x75, 0x6e, 0x74, 0x75, 0x20, + 0x73, 0x79, 0x73, 0x74, 0x65, 0x6d, 0x73, 0x20, 0x2d, 0x20, 0x74, 0x68, + 0x65, 0x6e, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x6e, 0x65, 0x65, 0x64, 0x20, + 0x74, 0x6f, 0x20, 0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x79, 0x20, 0x70, + 0x61, 0x74, 0x68, 0x20, 0x74, 0x6f, 0x20, 0x62, 0x61, 0x73, 0x68, 0x2e, + 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x52, 0x75, 0x6e, 0x20, 0x50, 0x61, + 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x0a, 0x72, 0x75, 0x6e, + 
0x4d, 0x6f, 0x64, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x52, 0x65, + 0x61, 0x64, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, + 0x6e, 0x67, 0x3a, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6f, 0x66, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x72, 0x75, 0x6e, 0x2e, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x52, + 0x65, 0x61, 0x64, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6d, 0x61, 0x70, + 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x47, 0x65, 0x6e, + 0x65, 0x72, 0x61, 0x74, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, + 0x74, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, 0x66, 0x69, + 0x6c, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x41, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, + 0x6e, 0x74, 0x73, 0x46, 0x72, 0x6f, 0x6d, 0x42, 0x41, 0x4d, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x66, 0x72, 0x6f, 0x6d, + 0x20, 0x42, 0x41, 0x4d, 0x2e, 0x20, 0x50, 0x72, 0x65, 0x73, 0x65, 0x6e, + 0x74, 0x6c, 0x79, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x77, 0x6f, 0x72, + 
0x6b, 0x73, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x2d, 0x2d, 0x6f, 0x75, + 0x74, 0x57, 0x69, 0x67, 0x54, 0x79, 0x70, 0x65, 0x20, 0x61, 0x6e, 0x64, + 0x20, 0x2d, 0x2d, 0x62, 0x61, 0x6d, 0x52, 0x65, 0x6d, 0x6f, 0x76, 0x65, + 0x44, 0x75, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x66, 0x74, + 0x4f, 0x76, 0x65, 0x72, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6c, + 0x69, 0x66, 0x74, 0x2d, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, + 0x47, 0x54, 0x46, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x28, 0x2d, + 0x2d, 0x73, 0x6a, 0x64, 0x62, 0x47, 0x54, 0x46, 0x66, 0x69, 0x6c, 0x65, + 0x29, 0x20, 0x62, 0x65, 0x74, 0x77, 0x65, 0x65, 0x6e, 0x20, 0x67, 0x65, + 0x6e, 0x6f, 0x6d, 0x65, 0x20, 0x61, 0x73, 0x73, 0x65, 0x6d, 0x62, 0x6c, + 0x69, 0x65, 0x73, 0x20, 0x75, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x63, 0x68, + 0x61, 0x69, 0x6e, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x28, 0x73, 0x29, 0x20, + 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x2d, 0x2d, 0x67, 0x65, 0x6e, 0x6f, 0x6d, + 0x65, 0x43, 0x68, 0x61, 0x69, 0x6e, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x2e, + 0x0a, 0x0a, 0x72, 0x75, 0x6e, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x4e, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x6e, 0x75, 0x6d, + 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, + 0x64, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x75, 0x6e, 0x20, 0x53, 0x54, + 0x41, 0x52, 0x0a, 0x0a, 0x72, 0x75, 0x6e, 0x44, 0x69, 0x72, 0x50, 0x65, + 0x72, 0x6d, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x55, 0x73, 0x65, 0x72, 0x5f, 0x52, 0x57, 0x58, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x70, 0x65, 0x72, + 0x6d, 0x69, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x66, 0x6f, 0x72, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x6f, + 0x72, 0x69, 0x65, 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, + 0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x75, 0x6e, 0x2d, + 0x74, 0x69, 0x6d, 0x65, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x55, 0x73, 0x65, 0x72, 0x5f, 0x52, 0x57, 0x58, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x2d, 0x72, 0x65, 0x61, 0x64, + 0x2f, 0x77, 0x72, 0x69, 0x74, 0x65, 0x2f, 0x65, 0x78, 0x65, 0x63, 0x75, + 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x41, + 0x6c, 0x6c, 0x5f, 0x52, 0x57, 0x58, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, + 0x61, 0x6c, 0x6c, 0x2d, 0x72, 0x65, 0x61, 0x64, 0x2f, 0x77, 0x72, 0x69, + 0x74, 0x65, 0x2f, 0x65, 0x78, 0x65, 0x63, 0x75, 0x74, 0x65, 0x20, 0x28, + 0x73, 0x61, 0x6d, 0x65, 0x20, 0x61, 0x73, 0x20, 0x63, 0x68, 0x6d, 0x6f, + 0x64, 0x20, 0x37, 0x37, 0x37, 0x29, 0x0a, 0x0a, 0x72, 0x75, 0x6e, 0x52, + 0x4e, 0x47, 0x73, 0x65, 0x65, 0x64, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x37, 0x37, 0x37, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, + 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, + 0x61, 0x74, 0x6f, 0x72, 0x20, 0x73, 0x65, 0x65, 0x64, 0x2e, 0x0a, 0x0a, + 0x0a, 0x23, 0x23, 0x23, 0x20, 0x47, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, + 
0x50, 0x61, 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x0a, 0x67, + 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x44, 0x69, 0x72, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2e, 0x2f, 0x47, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x44, + 0x69, 0x72, 0x2f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, + 0x6e, 0x67, 0x3a, 0x20, 0x70, 0x61, 0x74, 0x68, 0x20, 0x74, 0x6f, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x6f, 0x72, + 0x79, 0x20, 0x77, 0x68, 0x65, 0x72, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x6f, + 0x6d, 0x65, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x61, 0x72, 0x65, + 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x64, 0x20, 0x28, 0x66, 0x6f, 0x72, + 0x20, 0x2d, 0x2d, 0x72, 0x75, 0x6e, 0x4d, 0x6f, 0x64, 0x65, 0x20, 0x61, + 0x6c, 0x69, 0x67, 0x6e, 0x52, 0x65, 0x61, 0x64, 0x73, 0x29, 0x20, 0x6f, + 0x72, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x67, 0x65, + 0x6e, 0x65, 0x72, 0x61, 0x74, 0x65, 0x64, 0x20, 0x28, 0x66, 0x6f, 0x72, + 0x20, 0x2d, 0x2d, 0x72, 0x75, 0x6e, 0x4d, 0x6f, 0x64, 0x65, 0x20, 0x67, + 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x65, 0x47, 0x65, 0x6e, 0x6f, 0x6d, + 0x65, 0x29, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x4c, 0x6f, + 0x61, 0x64, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x53, 0x68, 0x61, 0x72, + 0x65, 0x64, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x6d, 0x6f, 0x64, + 0x65, 0x20, 0x6f, 0x66, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x20, + 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20, 0x75, 0x73, 0x61, 0x67, 0x65, + 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x65, 0x6e, + 0x6f, 0x6d, 0x65, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x2e, 0x20, 0x4f, + 0x6e, 0x6c, 0x79, 0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x77, 0x69, 0x74, + 0x68, 0x20, 0x2d, 0x2d, 0x72, 0x75, 0x6e, 0x4d, 0x6f, 0x64, 0x65, 0x20, + 
0x61, 0x6c, 0x69, 0x67, 0x6e, 0x52, 0x65, 0x61, 0x64, 0x73, 0x2e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x4c, 0x6f, 0x61, 0x64, 0x41, 0x6e, 0x64, 0x4b, 0x65, 0x65, + 0x70, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6c, 0x6f, + 0x61, 0x64, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, 0x69, 0x6e, + 0x74, 0x6f, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x20, 0x61, 0x6e, + 0x64, 0x20, 0x6b, 0x65, 0x65, 0x70, 0x20, 0x69, 0x74, 0x20, 0x69, 0x6e, + 0x20, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20, 0x61, 0x66, 0x74, 0x65, + 0x72, 0x20, 0x72, 0x75, 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4c, 0x6f, 0x61, 0x64, + 0x41, 0x6e, 0x64, 0x52, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x20, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x20, 0x67, 0x65, 0x6e, + 0x6f, 0x6d, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x73, 0x68, 0x61, + 0x72, 0x65, 0x64, 0x20, 0x62, 0x75, 0x74, 0x20, 0x72, 0x65, 0x6d, 0x6f, + 0x76, 0x65, 0x20, 0x69, 0x74, 0x20, 0x61, 0x66, 0x74, 0x65, 0x72, 0x20, + 0x72, 0x75, 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4c, 0x6f, 0x61, 0x64, 0x41, 0x6e, + 0x64, 0x45, 0x78, 0x69, 0x74, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, + 0x2e, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, + 0x65, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, + 0x64, 0x20, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20, 0x61, 0x6e, 0x64, + 0x20, 0x65, 0x78, 0x69, 0x74, 0x2c, 0x20, 0x6b, 0x65, 0x65, 0x70, 0x69, + 0x6e, 0x67, 0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, + 0x65, 0x20, 0x69, 0x6e, 0x20, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20, + 
0x66, 0x6f, 0x72, 0x20, 0x66, 0x75, 0x74, 0x75, 0x72, 0x65, 0x20, 0x72, + 0x75, 0x6e, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x52, 0x65, 0x6d, 0x6f, 0x76, 0x65, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, + 0x2e, 0x20, 0x64, 0x6f, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x6d, 0x61, 0x70, + 0x20, 0x61, 0x6e, 0x79, 0x74, 0x68, 0x69, 0x6e, 0x67, 0x2c, 0x20, 0x6a, + 0x75, 0x73, 0x74, 0x20, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x20, 0x6c, + 0x6f, 0x61, 0x64, 0x65, 0x64, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, + 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x53, 0x68, 0x61, 0x72, 0x65, 0x64, 0x4d, + 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x64, + 0x6f, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x75, 0x73, 0x65, 0x20, 0x73, 0x68, + 0x61, 0x72, 0x65, 0x64, 0x20, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x2c, + 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x6a, 0x6f, 0x62, 0x20, 0x77, 0x69, + 0x6c, 0x6c, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20, 0x69, 0x74, 0x73, 0x20, + 0x6f, 0x77, 0x6e, 0x20, 0x70, 0x72, 0x69, 0x76, 0x61, 0x74, 0x65, 0x20, + 0x63, 0x6f, 0x70, 0x79, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, + 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x6f, + 0x6d, 0x65, 0x46, 0x61, 0x73, 0x74, 0x61, 0x46, 0x69, 0x6c, 0x65, 0x73, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, + 0x28, 0x73, 0x29, 0x3a, 0x20, 0x70, 0x61, 0x74, 0x68, 0x28, 0x73, 0x29, + 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x66, 0x61, 0x73, 0x74, + 0x61, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x77, 0x69, 0x74, 0x68, + 
0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, + 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x73, 0x2c, 0x20, 0x73, + 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20, + 0x73, 0x70, 0x61, 0x63, 0x65, 0x73, 0x2e, 0x20, 0x54, 0x68, 0x65, 0x73, + 0x65, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x73, 0x68, 0x6f, 0x75, + 0x6c, 0x64, 0x20, 0x62, 0x65, 0x20, 0x70, 0x6c, 0x61, 0x69, 0x6e, 0x20, + 0x74, 0x65, 0x78, 0x74, 0x20, 0x46, 0x41, 0x53, 0x54, 0x41, 0x20, 0x66, + 0x69, 0x6c, 0x65, 0x73, 0x2c, 0x20, 0x74, 0x68, 0x65, 0x79, 0x20, 0x2a, + 0x63, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x2a, 0x20, 0x62, 0x65, 0x20, 0x7a, + 0x69, 0x70, 0x70, 0x65, 0x64, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x52, + 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, 0x67, + 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x28, 0x2d, + 0x2d, 0x72, 0x75, 0x6e, 0x4d, 0x6f, 0x64, 0x65, 0x20, 0x67, 0x65, 0x6e, + 0x6f, 0x6d, 0x65, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x65, 0x29, + 0x2e, 0x20, 0x43, 0x61, 0x6e, 0x20, 0x61, 0x6c, 0x73, 0x6f, 0x20, 0x62, + 0x65, 0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x6d, 0x61, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x28, 0x2d, + 0x2d, 0x72, 0x75, 0x6e, 0x4d, 0x6f, 0x64, 0x65, 0x20, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x52, 0x65, 0x61, 0x64, 0x73, 0x29, 0x20, 0x74, 0x6f, 0x20, + 0x61, 0x64, 0x64, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x20, 0x28, 0x6e, + 0x65, 0x77, 0x29, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, + 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x65, 0x6e, + 0x6f, 0x6d, 0x65, 0x20, 0x28, 0x65, 0x2e, 0x67, 0x2e, 0x20, 0x73, 0x70, + 0x69, 0x6b, 0x65, 0x2d, 0x69, 0x6e, 0x73, 0x29, 0x2e, 0x0a, 0x0a, 0x67, + 
0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x43, 0x68, 0x61, 0x69, 0x6e, 0x46, 0x69, + 0x6c, 0x65, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, + 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x63, 0x68, 0x61, 0x69, 0x6e, 0x20, 0x66, + 0x69, 0x6c, 0x65, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x6e, + 0x6f, 0x6d, 0x69, 0x63, 0x20, 0x6c, 0x69, 0x66, 0x74, 0x6f, 0x76, 0x65, + 0x72, 0x2e, 0x20, 0x4f, 0x6e, 0x6c, 0x79, 0x20, 0x75, 0x73, 0x65, 0x64, + 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x2d, 0x2d, 0x72, 0x75, 0x6e, 0x4d, + 0x6f, 0x64, 0x65, 0x20, 0x6c, 0x69, 0x66, 0x74, 0x4f, 0x76, 0x65, 0x72, + 0x20, 0x2e, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x46, 0x69, + 0x6c, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x75, 0x69, 0x6e, 0x74, 0x28, 0x73, 0x29, 0x3e, 0x30, 0x3a, + 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, 0x66, 0x69, 0x6c, 0x65, + 0x73, 0x20, 0x65, 0x78, 0x61, 0x63, 0x74, 0x20, 0x73, 0x69, 0x7a, 0x65, + 0x73, 0x20, 0x69, 0x6e, 0x20, 0x62, 0x79, 0x74, 0x65, 0x73, 0x2e, 0x20, + 0x54, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x6c, 0x79, 0x2c, 0x20, 0x74, + 0x68, 0x69, 0x73, 0x20, 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x20, 0x6e, + 0x6f, 0x74, 0x20, 0x62, 0x65, 0x20, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, + 0x64, 0x20, 0x62, 0x79, 0x20, 0x74, 0x68, 0x65, 0x20, 0x75, 0x73, 0x65, + 0x72, 0x2e, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x43, 0x6f, + 0x6e, 0x73, 0x65, 0x6e, 0x73, 0x75, 0x73, 0x46, 0x69, 0x6c, 0x65, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x56, 0x43, + 0x46, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x6e, 0x73, 0x75, 0x73, 0x20, 0x53, 0x4e, + 0x50, 0x73, 0x20, 0x28, 0x69, 0x2e, 0x65, 0x2e, 0x20, 0x61, 0x6c, 0x74, + 
0x65, 0x72, 0x6e, 0x61, 0x74, 0x69, 0x76, 0x65, 0x20, 0x61, 0x6c, 0x6c, + 0x65, 0x6c, 0x65, 0x20, 0x69, 0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, + 0x61, 0x6a, 0x6f, 0x72, 0x20, 0x28, 0x41, 0x46, 0x3e, 0x30, 0x2e, 0x35, + 0x29, 0x20, 0x61, 0x6c, 0x6c, 0x65, 0x6c, 0x65, 0x29, 0x0a, 0x0a, 0x23, + 0x23, 0x23, 0x20, 0x47, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, 0x49, 0x6e, + 0x64, 0x65, 0x78, 0x69, 0x6e, 0x67, 0x20, 0x50, 0x61, 0x72, 0x61, 0x6d, + 0x65, 0x74, 0x65, 0x72, 0x73, 0x20, 0x2d, 0x20, 0x6f, 0x6e, 0x6c, 0x79, + 0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x2d, + 0x2d, 0x72, 0x75, 0x6e, 0x4d, 0x6f, 0x64, 0x65, 0x20, 0x67, 0x65, 0x6e, + 0x6f, 0x6d, 0x65, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x65, 0x0a, + 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x43, 0x68, 0x72, 0x42, 0x69, 0x6e, + 0x4e, 0x62, 0x69, 0x74, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x31, 0x38, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x6e, 0x74, 0x3a, 0x20, 0x3d, 0x6c, 0x6f, 0x67, 0x32, 0x28, 0x63, 0x68, + 0x72, 0x42, 0x69, 0x6e, 0x29, 0x2c, 0x20, 0x77, 0x68, 0x65, 0x72, 0x65, + 0x20, 0x63, 0x68, 0x72, 0x42, 0x69, 0x6e, 0x20, 0x69, 0x73, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x62, 0x69, 0x6e, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x61, + 0x67, 0x65, 0x3a, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x63, 0x68, 0x72, + 0x6f, 0x6d, 0x6f, 0x73, 0x6f, 0x6d, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, + 0x20, 0x6f, 0x63, 0x63, 0x75, 0x70, 0x79, 0x20, 0x61, 0x6e, 0x20, 0x69, + 0x6e, 0x74, 0x65, 0x67, 0x65, 0x72, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x20, 0x6f, 0x66, 0x20, 0x62, 0x69, 0x6e, 0x73, 0x2e, 0x20, 0x46, + 0x6f, 0x72, 0x20, 0x61, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, + 0x77, 0x69, 0x74, 0x68, 0x20, 0x6c, 0x61, 0x72, 0x67, 0x65, 0x20, 0x6e, + 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x63, 0x6f, 0x6e, + 
0x74, 0x69, 0x67, 0x73, 0x2c, 0x20, 0x69, 0x74, 0x20, 0x69, 0x73, 0x20, + 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x6d, 0x65, 0x6e, 0x64, 0x65, 0x64, 0x20, + 0x74, 0x6f, 0x20, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x20, + 0x61, 0x73, 0x20, 0x6d, 0x69, 0x6e, 0x28, 0x31, 0x38, 0x2c, 0x20, 0x6c, + 0x6f, 0x67, 0x32, 0x5b, 0x6d, 0x61, 0x78, 0x28, 0x47, 0x65, 0x6e, 0x6f, + 0x6d, 0x65, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x2f, 0x4e, 0x75, 0x6d, + 0x62, 0x65, 0x72, 0x4f, 0x66, 0x52, 0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, + 0x63, 0x65, 0x73, 0x2c, 0x52, 0x65, 0x61, 0x64, 0x4c, 0x65, 0x6e, 0x67, + 0x74, 0x68, 0x29, 0x5d, 0x29, 0x2e, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x6f, + 0x6d, 0x65, 0x53, 0x41, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x4e, 0x62, 0x61, + 0x73, 0x65, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x31, 0x34, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, + 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x28, 0x62, 0x61, 0x73, 0x65, + 0x73, 0x29, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x53, 0x41, + 0x20, 0x70, 0x72, 0x65, 0x2d, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x69, 0x6e, + 0x67, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x2e, 0x20, 0x54, 0x79, + 0x70, 0x69, 0x63, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x62, 0x65, 0x74, 0x77, + 0x65, 0x65, 0x6e, 0x20, 0x31, 0x30, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x31, + 0x35, 0x2e, 0x20, 0x4c, 0x6f, 0x6e, 0x67, 0x65, 0x72, 0x20, 0x73, 0x74, + 0x72, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x75, + 0x73, 0x65, 0x20, 0x6d, 0x75, 0x63, 0x68, 0x20, 0x6d, 0x6f, 0x72, 0x65, + 0x20, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x2c, 0x20, 0x62, 0x75, 0x74, + 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x20, 0x66, 0x61, 0x73, 0x74, 0x65, + 0x72, 0x20, 0x73, 0x65, 0x61, 0x72, 0x63, 0x68, 0x65, 0x73, 0x2e, 0x20, + 0x46, 0x6f, 0x72, 0x20, 0x73, 0x6d, 0x61, 0x6c, 0x6c, 0x20, 0x67, 0x65, + 0x6e, 0x6f, 0x6d, 0x65, 0x73, 0x2c, 0x20, 0x74, 0x68, 0x65, 0x20, 0x70, + 
0x61, 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x20, 0x2d, 0x2d, 0x67, + 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x53, 0x41, 0x69, 0x6e, 0x64, 0x65, 0x78, + 0x4e, 0x62, 0x61, 0x73, 0x65, 0x73, 0x20, 0x6d, 0x75, 0x73, 0x74, 0x20, + 0x62, 0x65, 0x20, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x64, 0x20, 0x64, 0x6f, + 0x77, 0x6e, 0x20, 0x74, 0x6f, 0x20, 0x6d, 0x69, 0x6e, 0x28, 0x31, 0x34, + 0x2c, 0x20, 0x6c, 0x6f, 0x67, 0x32, 0x28, 0x47, 0x65, 0x6e, 0x6f, 0x6d, + 0x65, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x29, 0x2f, 0x32, 0x20, 0x2d, + 0x20, 0x31, 0x29, 0x2e, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, + 0x53, 0x41, 0x73, 0x70, 0x61, 0x72, 0x73, 0x65, 0x44, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x73, + 0x75, 0x66, 0x66, 0x75, 0x78, 0x20, 0x61, 0x72, 0x72, 0x61, 0x79, 0x20, + 0x73, 0x70, 0x61, 0x72, 0x73, 0x69, 0x74, 0x79, 0x2c, 0x20, 0x69, 0x2e, + 0x65, 0x2e, 0x20, 0x64, 0x69, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x20, + 0x62, 0x65, 0x74, 0x77, 0x65, 0x65, 0x6e, 0x20, 0x69, 0x6e, 0x64, 0x69, + 0x63, 0x65, 0x73, 0x3a, 0x20, 0x75, 0x73, 0x65, 0x20, 0x62, 0x69, 0x67, + 0x67, 0x65, 0x72, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x20, + 0x74, 0x6f, 0x20, 0x64, 0x65, 0x63, 0x72, 0x65, 0x61, 0x73, 0x65, 0x20, + 0x6e, 0x65, 0x65, 0x64, 0x65, 0x64, 0x20, 0x52, 0x41, 0x4d, 0x20, 0x61, + 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x73, 0x74, 0x20, 0x6f, + 0x66, 0x20, 0x6d, 0x61, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x73, 0x70, + 0x65, 0x65, 0x64, 0x20, 0x72, 0x65, 0x64, 0x75, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x53, 0x75, 0x66, + 0x66, 0x69, 0x78, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x4d, 0x61, 0x78, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x31, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, + 0x75, 0x6d, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x6f, 0x66, + 
0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x75, 0x66, 0x66, 0x69, 0x78, 0x65, + 0x73, 0x2c, 0x20, 0x68, 0x61, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x62, 0x65, + 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x65, 0x72, 0x20, 0x74, 0x68, 0x61, 0x6e, + 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, + 0x2e, 0x20, 0x2d, 0x31, 0x20, 0x3d, 0x20, 0x69, 0x6e, 0x66, 0x69, 0x6e, + 0x69, 0x74, 0x65, 0x2e, 0x0a, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x53, + 0x70, 0x6c, 0x69, 0x63, 0x65, 0x20, 0x4a, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x73, 0x20, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, + 0x0a, 0x73, 0x6a, 0x64, 0x62, 0x46, 0x69, 0x6c, 0x65, 0x43, 0x68, 0x72, + 0x53, 0x74, 0x61, 0x72, 0x74, 0x45, 0x6e, 0x64, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, + 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, 0x3a, 0x20, 0x70, 0x61, + 0x74, 0x68, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x66, 0x69, + 0x6c, 0x65, 0x73, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x67, 0x65, 0x6e, + 0x6f, 0x6d, 0x69, 0x63, 0x20, 0x63, 0x6f, 0x6f, 0x72, 0x64, 0x69, 0x6e, + 0x61, 0x74, 0x65, 0x73, 0x20, 0x28, 0x63, 0x68, 0x72, 0x20, 0x3c, 0x74, + 0x61, 0x62, 0x3e, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x3c, 0x74, + 0x61, 0x62, 0x3e, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x3c, 0x74, 0x61, 0x62, + 0x3e, 0x20, 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x29, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x70, 0x6c, 0x69, 0x63, 0x65, + 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x6e, + 0x74, 0x72, 0x6f, 0x6e, 0x73, 0x2e, 0x20, 0x4d, 0x75, 0x6c, 0x74, 0x69, + 0x70, 0x6c, 0x65, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x63, 0x61, + 0x6e, 0x20, 0x62, 0x65, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6c, 0x69, 0x65, + 0x64, 0x20, 0x77, 0x61, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, + 0x62, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x63, 0x61, 0x74, 0x65, 0x6e, 0x61, + 
0x74, 0x65, 0x64, 0x2e, 0x0a, 0x0a, 0x73, 0x6a, 0x64, 0x62, 0x47, 0x54, + 0x46, 0x66, 0x69, 0x6c, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, + 0x70, 0x61, 0x74, 0x68, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, + 0x47, 0x54, 0x46, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x20, 0x77, 0x69, 0x74, + 0x68, 0x20, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, + 0x73, 0x0a, 0x0a, 0x73, 0x6a, 0x64, 0x62, 0x47, 0x54, 0x46, 0x63, 0x68, + 0x72, 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x70, 0x72, 0x65, + 0x66, 0x69, 0x78, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x63, 0x68, 0x72, 0x6f, + 0x6d, 0x6f, 0x73, 0x6f, 0x6d, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x73, + 0x20, 0x69, 0x6e, 0x20, 0x61, 0x20, 0x47, 0x54, 0x46, 0x20, 0x66, 0x69, + 0x6c, 0x65, 0x20, 0x28, 0x65, 0x2e, 0x67, 0x2e, 0x20, 0x27, 0x63, 0x68, + 0x72, 0x27, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x75, 0x73, 0x69, 0x6e, 0x67, + 0x20, 0x45, 0x4e, 0x53, 0x4d, 0x45, 0x42, 0x4c, 0x20, 0x61, 0x6e, 0x6e, + 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x77, 0x69, 0x74, + 0x68, 0x20, 0x55, 0x43, 0x53, 0x43, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, + 0x65, 0x73, 0x29, 0x0a, 0x0a, 0x73, 0x6a, 0x64, 0x62, 0x47, 0x54, 0x46, + 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x45, 0x78, 0x6f, 0x6e, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x78, 0x6f, + 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, + 0x3a, 0x20, 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x20, 0x74, 0x79, + 
0x70, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x47, 0x54, 0x46, 0x20, 0x66, 0x69, + 0x6c, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x62, 0x65, 0x20, 0x75, 0x73, 0x65, + 0x64, 0x20, 0x61, 0x73, 0x20, 0x65, 0x78, 0x6f, 0x6e, 0x73, 0x20, 0x66, + 0x6f, 0x72, 0x20, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x69, 0x6e, 0x67, 0x20, + 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x73, 0x0a, + 0x0a, 0x73, 0x6a, 0x64, 0x62, 0x47, 0x54, 0x46, 0x74, 0x61, 0x67, 0x45, + 0x78, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x54, 0x72, 0x61, + 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, + 0x69, 0x70, 0x74, 0x5f, 0x69, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, + 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x47, 0x54, 0x46, 0x20, 0x61, + 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, + 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, + 0x49, 0x44, 0x20, 0x28, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, + 0x22, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x5f, + 0x69, 0x64, 0x22, 0x20, 0x77, 0x6f, 0x72, 0x6b, 0x73, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x47, 0x54, 0x46, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x29, + 0x0a, 0x0a, 0x73, 0x6a, 0x64, 0x62, 0x47, 0x54, 0x46, 0x74, 0x61, 0x67, + 0x45, 0x78, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x47, 0x65, + 0x6e, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x5f, 0x69, + 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, + 0x3a, 0x20, 0x47, 0x54, 0x46, 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, + 0x75, 0x74, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x66, 0x6f, 0x72, + 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, + 0x20, 0x49, 0x44, 0x20, 0x28, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, + 
0x20, 0x22, 0x67, 0x65, 0x6e, 0x65, 0x5f, 0x69, 0x64, 0x22, 0x20, 0x77, + 0x6f, 0x72, 0x6b, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x47, 0x54, 0x46, + 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x29, 0x0a, 0x0a, 0x73, 0x6a, 0x64, + 0x62, 0x47, 0x54, 0x46, 0x74, 0x61, 0x67, 0x45, 0x78, 0x6f, 0x6e, 0x50, + 0x61, 0x72, 0x65, 0x6e, 0x74, 0x47, 0x65, 0x6e, 0x65, 0x4e, 0x61, 0x6d, + 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x67, 0x65, 0x6e, 0x65, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, + 0x3a, 0x20, 0x47, 0x54, 0x46, 0x20, 0x61, 0x74, 0x74, 0x72, 0x62, 0x75, + 0x74, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x0a, 0x73, 0x6a, 0x64, 0x62, 0x47, 0x54, + 0x46, 0x74, 0x61, 0x67, 0x45, 0x78, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x65, + 0x6e, 0x74, 0x47, 0x65, 0x6e, 0x65, 0x54, 0x79, 0x70, 0x65, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, + 0x6e, 0x65, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x65, + 0x5f, 0x62, 0x69, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, 0x3a, 0x20, + 0x47, 0x54, 0x46, 0x20, 0x61, 0x74, 0x74, 0x72, 0x62, 0x75, 0x74, 0x65, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x61, + 0x72, 0x65, 0x6e, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x20, 0x74, 0x79, + 0x70, 0x65, 0x0a, 0x0a, 0x73, 0x6a, 0x64, 0x62, 0x4f, 0x76, 0x65, 0x72, + 0x68, 0x61, 0x6e, 0x67, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x30, 0x30, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6c, + 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, + 
0x20, 0x64, 0x6f, 0x6e, 0x6f, 0x72, 0x2f, 0x61, 0x63, 0x63, 0x65, 0x70, + 0x74, 0x6f, 0x72, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, + 0x20, 0x6f, 0x6e, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x73, 0x69, 0x64, + 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6a, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2c, 0x20, 0x69, 0x64, 0x65, 0x61, + 0x6c, 0x6c, 0x79, 0x20, 0x3d, 0x20, 0x28, 0x6d, 0x61, 0x74, 0x65, 0x5f, + 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x2d, 0x20, 0x31, 0x29, 0x0a, + 0x0a, 0x73, 0x6a, 0x64, 0x62, 0x53, 0x63, 0x6f, 0x72, 0x65, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x6e, 0x74, 0x3a, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x20, 0x61, 0x6c, + 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x20, 0x73, 0x63, 0x6f, 0x72, + 0x65, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, + 0x65, 0x74, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x63, 0x72, 0x6f, + 0x73, 0x73, 0x20, 0x64, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x20, + 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x0a, 0x0a, 0x73, + 0x6a, 0x64, 0x62, 0x49, 0x6e, 0x73, 0x65, 0x72, 0x74, 0x53, 0x61, 0x76, + 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x42, 0x61, 0x73, 0x69, 0x63, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x77, 0x68, 0x69, + 0x63, 0x68, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, + 0x73, 0x61, 0x76, 0x65, 0x20, 0x77, 0x68, 0x65, 0x6e, 0x20, 0x73, 0x6a, + 0x64, 0x62, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, + 0x20, 0x61, 0x72, 0x65, 0x20, 0x69, 0x6e, 0x73, 0x65, 0x72, 0x74, 0x65, + 0x64, 0x20, 0x6f, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x66, 0x6c, 0x79, + 
0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x70, 0x70, + 0x69, 0x6e, 0x67, 0x20, 0x73, 0x74, 0x65, 0x70, 0x0a, 0x09, 0x09, 0x09, + 0x09, 0x09, 0x42, 0x61, 0x73, 0x69, 0x63, 0x20, 0x2e, 0x2e, 0x2e, 0x20, + 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x73, 0x6d, 0x61, 0x6c, 0x6c, 0x20, 0x6a, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x2f, 0x20, 0x74, 0x72, + 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, 0x66, 0x69, 0x6c, + 0x65, 0x73, 0x0a, 0x09, 0x09, 0x09, 0x09, 0x09, 0x41, 0x6c, 0x6c, 0x20, + 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x66, 0x69, + 0x6c, 0x65, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x69, 0x6e, + 0x67, 0x20, 0x62, 0x69, 0x67, 0x20, 0x47, 0x65, 0x6e, 0x6f, 0x6d, 0x65, + 0x2c, 0x20, 0x53, 0x41, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x53, 0x41, 0x69, + 0x6e, 0x64, 0x65, 0x78, 0x20, 0x2d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, + 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x20, + 0x61, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x20, 0x67, + 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, + 0x6f, 0x72, 0x79, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x56, 0x61, 0x72, + 0x69, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x65, 0x74, 0x65, 0x72, 0x73, 0x0a, 0x76, 0x61, 0x72, 0x56, 0x43, 0x46, + 0x66, 0x69, 0x6c, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, + 0x70, 0x61, 0x74, 0x68, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, + 0x56, 0x43, 0x46, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x20, 0x74, 0x68, 0x61, + 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x73, 0x20, 0x76, + 0x61, 0x72, 0x69, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x64, 0x61, 0x74, + 0x61, 0x2e, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x49, 0x6e, 0x70, 0x75, + 
0x74, 0x20, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x0a, 0x69, 0x6e, 0x70, 0x75, + 0x74, 0x42, 0x41, 0x4d, 0x66, 0x69, 0x6c, 0x65, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, + 0x3a, 0x20, 0x70, 0x61, 0x74, 0x68, 0x20, 0x74, 0x6f, 0x20, 0x42, 0x41, + 0x4d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x66, 0x69, 0x6c, 0x65, + 0x2c, 0x20, 0x74, 0x6f, 0x20, 0x62, 0x65, 0x20, 0x75, 0x73, 0x65, 0x64, + 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x2d, 0x2d, 0x72, 0x75, 0x6e, 0x4d, + 0x6f, 0x64, 0x65, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x41, 0x6c, 0x69, + 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x46, 0x72, 0x6f, 0x6d, 0x42, + 0x41, 0x4d, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x52, 0x65, 0x61, 0x64, + 0x20, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x0a, + 0x72, 0x65, 0x61, 0x64, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x54, 0x79, 0x70, + 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x46, 0x61, 0x73, 0x74, 0x78, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x66, 0x6f, + 0x72, 0x6d, 0x61, 0x74, 0x20, 0x6f, 0x66, 0x20, 0x69, 0x6e, 0x70, 0x75, + 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x46, 0x61, 0x73, 0x74, 0x78, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x46, 0x41, 0x53, + 0x54, 0x41, 0x20, 0x6f, 0x72, 0x20, 0x46, 0x41, 0x53, 0x54, 0x51, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x53, 0x41, 0x4d, 0x20, 0x53, 0x45, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x53, 0x41, 0x4d, 0x20, + 
0x6f, 0x72, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x73, 0x69, 0x6e, 0x67, 0x6c, + 0x65, 0x2d, 0x65, 0x6e, 0x64, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x3b, + 0x20, 0x66, 0x6f, 0x72, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x75, 0x73, 0x65, + 0x20, 0x2d, 0x2d, 0x72, 0x65, 0x61, 0x64, 0x46, 0x69, 0x6c, 0x65, 0x73, + 0x43, 0x6f, 0x6d, 0x6d, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x61, 0x6d, 0x74, + 0x6f, 0x6f, 0x6c, 0x73, 0x20, 0x76, 0x69, 0x65, 0x77, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x53, 0x41, 0x4d, 0x20, 0x50, 0x45, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x53, 0x41, 0x4d, 0x20, 0x6f, 0x72, + 0x20, 0x42, 0x41, 0x4d, 0x20, 0x70, 0x61, 0x69, 0x72, 0x65, 0x64, 0x2d, + 0x65, 0x6e, 0x64, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x3b, 0x20, 0x66, + 0x6f, 0x72, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x75, 0x73, 0x65, 0x20, 0x2d, + 0x2d, 0x72, 0x65, 0x61, 0x64, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x43, 0x6f, + 0x6d, 0x6d, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x61, 0x6d, 0x74, 0x6f, 0x6f, + 0x6c, 0x73, 0x20, 0x76, 0x69, 0x65, 0x77, 0x0a, 0x0a, 0x72, 0x65, 0x61, + 0x64, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x49, 0x6e, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x52, 0x65, 0x61, 0x64, 0x31, 0x20, 0x52, 0x65, 0x61, 0x64, 0x32, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, + 0x73, 0x29, 0x3a, 0x20, 0x70, 0x61, 0x74, 0x68, 0x73, 0x20, 0x74, 0x6f, + 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, + 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x20, 0x69, 0x6e, 0x70, 0x75, + 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x31, 0x20, 0x28, 0x61, 0x6e, 0x64, + 0x2c, 0x20, 0x69, 0x66, 0x20, 0x6e, 0x65, 0x65, 0x64, 0x65, 0x64, 0x2c, + 0x20, 0x20, 0x72, 0x65, 0x61, 0x64, 0x32, 0x29, 0x0a, 0x0a, 0x72, 0x65, + 0x61, 0x64, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x50, 0x72, 0x65, 0x66, 0x69, + 
0x78, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, + 0x6e, 0x67, 0x3a, 0x20, 0x70, 0x72, 0x65, 0x69, 0x66, 0x78, 0x20, 0x66, + 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, + 0x66, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x73, 0x2c, + 0x20, 0x69, 0x2e, 0x65, 0x2e, 0x20, 0x69, 0x74, 0x20, 0x77, 0x69, 0x6c, + 0x6c, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, 0x69, + 0x6e, 0x20, 0x66, 0x72, 0x6f, 0x6e, 0x74, 0x20, 0x6f, 0x66, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x69, + 0x6e, 0x20, 0x2d, 0x2d, 0x72, 0x65, 0x61, 0x64, 0x46, 0x69, 0x6c, 0x65, + 0x73, 0x49, 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x3a, 0x20, 0x6e, + 0x6f, 0x20, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x0a, 0x0a, 0x72, 0x65, + 0x61, 0x64, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x43, 0x6f, 0x6d, 0x6d, 0x61, + 0x6e, 0x64, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, + 0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, 0x3a, 0x20, 0x63, 0x6f, 0x6d, 0x6d, + 0x61, 0x6e, 0x64, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x74, 0x6f, 0x20, + 0x65, 0x78, 0x65, 0x63, 0x75, 0x74, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x65, 0x61, 0x63, 0x68, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x2e, 0x20, + 0x54, 0x68, 0x69, 0x73, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x61, 0x6e, 0x64, + 0x20, 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x20, 0x67, 0x65, 0x6e, 0x65, + 0x72, 0x61, 0x74, 0x65, 0x20, 0x46, 0x41, 0x53, 0x54, 0x41, 0x20, 0x6f, + 0x72, 0x20, 0x46, 0x41, 0x53, 0x54, 0x51, 0x20, 0x74, 0x65, 0x78, 0x74, + 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x65, 0x6e, 0x64, 0x20, 0x69, 0x74, + 
0x20, 0x74, 0x6f, 0x20, 0x73, 0x74, 0x64, 0x6f, 0x75, 0x74, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x46, 0x6f, 0x72, 0x20, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, + 0x65, 0x3a, 0x20, 0x7a, 0x63, 0x61, 0x74, 0x20, 0x2d, 0x20, 0x74, 0x6f, + 0x20, 0x75, 0x6e, 0x63, 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x20, + 0x2e, 0x67, 0x7a, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x2c, 0x20, 0x62, + 0x7a, 0x63, 0x61, 0x74, 0x20, 0x2d, 0x20, 0x74, 0x6f, 0x20, 0x75, 0x6e, + 0x63, 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x20, 0x2e, 0x62, 0x7a, + 0x32, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x2c, 0x20, 0x65, 0x74, 0x63, + 0x2e, 0x0a, 0x0a, 0x72, 0x65, 0x61, 0x64, 0x4d, 0x61, 0x70, 0x4e, 0x75, + 0x6d, 0x62, 0x65, 0x72, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x31, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x20, 0x6f, 0x66, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x20, 0x74, + 0x6f, 0x20, 0x6d, 0x61, 0x70, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x62, 0x65, 0x67, 0x69, 0x6e, 0x6e, 0x69, 0x6e, 0x67, + 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x66, 0x69, 0x6c, 0x65, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x31, 0x3a, 0x20, 0x6d, 0x61, 0x70, + 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x0a, 0x0a, + 0x72, 0x65, 0x61, 0x64, 0x4d, 0x61, 0x74, 0x65, 0x73, 0x4c, 0x65, 0x6e, + 0x67, 0x74, 0x68, 0x73, 0x49, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x74, 0x45, 0x71, 0x75, 0x61, 0x6c, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, + 0x20, 0x45, 0x71, 0x75, 0x61, 0x6c, 0x2f, 0x4e, 0x6f, 0x74, 0x45, 0x71, + 0x75, 0x61, 0x6c, 0x20, 0x2d, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, + 
0x73, 0x20, 0x6f, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x73, 0x2c, 0x73, + 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x73, 0x2c, 0x71, 0x75, 0x61, + 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x62, + 0x6f, 0x74, 0x68, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x72, + 0x65, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x61, 0x6d, 0x65, 0x20, 0x20, + 0x2f, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x61, + 0x6d, 0x65, 0x2e, 0x20, 0x4e, 0x6f, 0x74, 0x45, 0x71, 0x75, 0x61, 0x6c, + 0x20, 0x69, 0x73, 0x20, 0x73, 0x61, 0x66, 0x65, 0x20, 0x69, 0x6e, 0x20, + 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x69, 0x74, 0x75, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x73, 0x2e, 0x0a, 0x0a, 0x72, 0x65, 0x61, 0x64, 0x4e, 0x61, 0x6d, + 0x65, 0x53, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, 0x6f, 0x72, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, + 0x3a, 0x20, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63, 0x74, 0x65, 0x72, 0x28, + 0x73, 0x29, 0x20, 0x73, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, 0x69, 0x6e, + 0x67, 0x20, 0x74, 0x68, 0x65, 0x20, 0x70, 0x61, 0x72, 0x74, 0x20, 0x6f, + 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x6e, + 0x61, 0x6d, 0x65, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x77, 0x69, + 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x74, 0x72, 0x69, 0x6d, 0x6d, 0x65, + 0x64, 0x20, 0x69, 0x6e, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, + 0x28, 0x72, 0x65, 0x61, 0x64, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x61, + 0x66, 0x74, 0x65, 0x72, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, + 0x73, 0x20, 0x61, 0x6c, 0x77, 0x61, 0x79, 0x73, 0x20, 0x74, 0x72, 0x69, + 0x6d, 0x6d, 0x65, 0x64, 0x29, 0x0a, 0x0a, 0x63, 0x6c, 0x69, 0x70, 0x33, + 0x70, 0x4e, 0x62, 0x61, 0x73, 0x65, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x28, 0x73, 0x29, + 
0x3a, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x28, 0x73, 0x29, 0x20, + 0x6f, 0x66, 0x20, 0x62, 0x61, 0x73, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, + 0x63, 0x6c, 0x69, 0x70, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x33, 0x70, + 0x20, 0x6f, 0x66, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x6d, 0x61, 0x74, + 0x65, 0x2e, 0x20, 0x49, 0x66, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x20, 0x69, 0x73, 0x20, 0x67, 0x69, 0x76, 0x65, 0x6e, + 0x2c, 0x20, 0x69, 0x74, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, + 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x73, 0x61, 0x6d, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x62, 0x6f, + 0x74, 0x68, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x0a, 0x0a, 0x63, + 0x6c, 0x69, 0x70, 0x35, 0x70, 0x4e, 0x62, 0x61, 0x73, 0x65, 0x73, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, + 0x74, 0x28, 0x73, 0x29, 0x3a, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, + 0x28, 0x73, 0x29, 0x20, 0x6f, 0x66, 0x20, 0x62, 0x61, 0x73, 0x65, 0x73, + 0x20, 0x74, 0x6f, 0x20, 0x63, 0x6c, 0x69, 0x70, 0x20, 0x66, 0x72, 0x6f, + 0x6d, 0x20, 0x35, 0x70, 0x20, 0x6f, 0x66, 0x20, 0x65, 0x61, 0x63, 0x68, + 0x20, 0x6d, 0x61, 0x74, 0x65, 0x2e, 0x20, 0x49, 0x66, 0x20, 0x6f, 0x6e, + 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x69, 0x73, 0x20, 0x67, + 0x69, 0x76, 0x65, 0x6e, 0x2c, 0x20, 0x69, 0x74, 0x20, 0x77, 0x69, 0x6c, + 0x6c, 0x20, 0x62, 0x65, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x61, 0x6d, 0x65, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x62, 0x6f, 0x74, 0x68, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x73, + 0x2e, 0x0a, 0x0a, 0x63, 0x6c, 0x69, 0x70, 0x33, 0x70, 0x41, 0x64, 0x61, + 0x70, 0x74, 0x65, 0x72, 0x53, 0x65, 0x71, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, 0x3a, 0x20, + 
0x61, 0x64, 0x61, 0x70, 0x74, 0x65, 0x72, 0x20, 0x73, 0x65, 0x71, 0x75, + 0x65, 0x6e, 0x63, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x63, 0x6c, 0x69, + 0x70, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x33, 0x70, 0x20, 0x6f, 0x66, + 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x2e, 0x20, + 0x20, 0x49, 0x66, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x20, 0x69, 0x73, 0x20, 0x67, 0x69, 0x76, 0x65, 0x6e, 0x2c, 0x20, + 0x69, 0x74, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x61, + 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, + 0x61, 0x6d, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x62, 0x6f, 0x74, 0x68, + 0x20, 0x6d, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x0a, 0x0a, 0x63, 0x6c, 0x69, + 0x70, 0x33, 0x70, 0x41, 0x64, 0x61, 0x70, 0x74, 0x65, 0x72, 0x4d, 0x4d, + 0x70, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x30, 0x2e, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x75, + 0x62, 0x6c, 0x65, 0x28, 0x73, 0x29, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x20, + 0x70, 0x72, 0x6f, 0x70, 0x6f, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6f, + 0x66, 0x20, 0x6d, 0x69, 0x73, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, + 0x20, 0x66, 0x6f, 0x72, 0x20, 0x33, 0x70, 0x20, 0x61, 0x64, 0x70, 0x61, + 0x74, 0x65, 0x72, 0x20, 0x63, 0x6c, 0x69, 0x70, 0x70, 0x69, 0x6e, 0x67, + 0x20, 0x66, 0x6f, 0x72, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x6d, 0x61, + 0x74, 0x65, 0x2e, 0x20, 0x20, 0x49, 0x66, 0x20, 0x6f, 0x6e, 0x65, 0x20, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x69, 0x73, 0x20, 0x67, 0x69, 0x76, + 0x65, 0x6e, 0x2c, 0x20, 0x69, 0x74, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, + 0x62, 0x65, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x73, 0x61, 0x6d, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x62, 0x6f, 0x74, 0x68, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x0a, + 0x0a, 0x63, 0x6c, 0x69, 0x70, 0x33, 0x70, 0x41, 0x66, 0x74, 0x65, 0x72, + 0x41, 0x64, 0x61, 0x70, 0x74, 0x65, 0x72, 0x4e, 0x62, 0x61, 0x73, 0x65, + 
0x73, 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x6e, 0x74, 0x28, 0x73, 0x29, 0x3a, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x20, 0x6f, 0x66, 0x20, 0x62, 0x61, 0x73, 0x65, 0x73, 0x20, 0x74, + 0x6f, 0x20, 0x63, 0x6c, 0x69, 0x70, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, + 0x33, 0x70, 0x20, 0x6f, 0x66, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x6d, + 0x61, 0x74, 0x65, 0x20, 0x61, 0x66, 0x74, 0x65, 0x72, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x61, 0x64, 0x61, 0x70, 0x74, 0x65, 0x72, 0x20, 0x63, 0x6c, + 0x69, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x2e, 0x20, 0x49, 0x66, 0x20, 0x6f, + 0x6e, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x69, 0x73, 0x20, + 0x67, 0x69, 0x76, 0x65, 0x6e, 0x2c, 0x20, 0x69, 0x74, 0x20, 0x77, 0x69, + 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, + 0x64, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x61, 0x6d, 0x65, 0x20, 0x66, + 0x6f, 0x72, 0x20, 0x62, 0x6f, 0x74, 0x68, 0x20, 0x6d, 0x61, 0x74, 0x65, + 0x73, 0x2e, 0x0a, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x4c, 0x69, 0x6d, + 0x69, 0x74, 0x73, 0x0a, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x47, 0x65, 0x6e, + 0x6f, 0x6d, 0x65, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x65, 0x52, + 0x41, 0x4d, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x31, 0x30, 0x30, 0x30, 0x30, 0x30, + 0x30, 0x30, 0x30, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, + 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20, + 0x61, 0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x20, 0x52, 0x41, + 0x4d, 0x20, 0x28, 0x62, 0x79, 0x74, 0x65, 0x73, 0x29, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, 0x67, 0x65, 0x6e, + 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x0a, 0x0a, 0x6c, 0x69, 0x6d, + 0x69, 0x74, 0x49, 0x4f, 0x62, 0x75, 0x66, 0x66, 0x65, 0x72, 0x53, 0x69, + 0x7a, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x35, + 
0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x20, 0x61, + 0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x20, 0x62, 0x75, 0x66, + 0x66, 0x65, 0x72, 0x73, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x28, 0x62, + 0x79, 0x74, 0x65, 0x73, 0x29, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x69, 0x6e, + 0x70, 0x75, 0x74, 0x2f, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x2c, 0x20, + 0x70, 0x65, 0x72, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x0a, 0x0a, + 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x4f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x6f, + 0x6e, 0x65, 0x52, 0x65, 0x61, 0x64, 0x42, 0x79, 0x74, 0x65, 0x73, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x31, 0x30, 0x30, 0x30, 0x30, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x20, 0x73, + 0x69, 0x7a, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x53, + 0x41, 0x4d, 0x20, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x20, 0x28, 0x62, + 0x79, 0x74, 0x65, 0x73, 0x29, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6f, 0x6e, + 0x65, 0x20, 0x72, 0x65, 0x61, 0x64, 0x2e, 0x20, 0x52, 0x65, 0x63, 0x6f, + 0x6d, 0x6d, 0x65, 0x6e, 0x64, 0x65, 0x64, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x3a, 0x20, 0x3e, 0x28, 0x32, 0x2a, 0x28, 0x4c, 0x65, 0x6e, 0x67, + 0x74, 0x68, 0x4d, 0x61, 0x74, 0x65, 0x31, 0x2b, 0x4c, 0x65, 0x6e, 0x67, + 0x74, 0x68, 0x4d, 0x61, 0x74, 0x65, 0x32, 0x2b, 0x31, 0x30, 0x30, 0x29, + 0x2a, 0x6f, 0x75, 0x74, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x4d, 0x75, + 0x6c, 0x74, 0x69, 0x6d, 0x61, 0x70, 0x4e, 0x6d, 0x61, 0x78, 0x0a, 0x0a, + 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x4f, 0x75, 0x74, 0x53, 0x4a, 0x6f, 0x6e, + 0x65, 0x52, 0x65, 0x61, 0x64, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x31, 0x30, 0x30, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, + 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x20, 0x6e, 0x75, 0x6d, + 
0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6f, 0x6e, 0x65, + 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x28, 0x69, 0x6e, 0x63, 0x6c, 0x75, + 0x64, 0x69, 0x6e, 0x67, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x6d, 0x75, 0x6c, + 0x74, 0x69, 0x2d, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x72, 0x73, 0x29, 0x0a, + 0x0a, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x4f, 0x75, 0x74, 0x53, 0x4a, 0x63, + 0x6f, 0x6c, 0x6c, 0x61, 0x70, 0x73, 0x65, 0x64, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x31, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, + 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x63, + 0x6f, 0x6c, 0x6c, 0x61, 0x70, 0x73, 0x65, 0x64, 0x20, 0x6a, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x0a, 0x0a, 0x6c, 0x69, 0x6d, 0x69, + 0x74, 0x42, 0x41, 0x4d, 0x73, 0x6f, 0x72, 0x74, 0x52, 0x41, 0x4d, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x3d, 0x30, + 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20, 0x61, 0x76, + 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x20, 0x52, 0x41, 0x4d, 0x20, + 0x28, 0x62, 0x79, 0x74, 0x65, 0x73, 0x29, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x73, 0x6f, 0x72, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x42, 0x41, 0x4d, 0x2e, + 0x20, 0x49, 0x66, 0x20, 0x3d, 0x30, 0x2c, 0x20, 0x69, 0x74, 0x20, 0x77, + 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x73, 0x65, 0x74, 0x20, 0x74, + 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, + 0x20, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2e, + 0x20, 0x30, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x63, 0x61, 0x6e, + 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x62, 0x65, 0x20, 0x75, 0x73, 0x65, + 
0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x2d, 0x2d, 0x67, 0x65, 0x6e, + 0x6f, 0x6d, 0x65, 0x4c, 0x6f, 0x61, 0x64, 0x20, 0x4e, 0x6f, 0x53, 0x68, + 0x61, 0x72, 0x65, 0x64, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20, 0x6f, + 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x0a, 0x0a, 0x6c, 0x69, 0x6d, 0x69, + 0x74, 0x53, 0x6a, 0x64, 0x62, 0x49, 0x6e, 0x73, 0x65, 0x72, 0x74, 0x4e, + 0x73, 0x6a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, + 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x6e, 0x74, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, + 0x75, 0x6d, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, + 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x6f, + 0x20, 0x62, 0x65, 0x20, 0x69, 0x6e, 0x73, 0x65, 0x72, 0x74, 0x65, 0x64, + 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x6f, + 0x6d, 0x65, 0x20, 0x6f, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x66, 0x6c, + 0x79, 0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x70, + 0x70, 0x69, 0x6e, 0x67, 0x20, 0x73, 0x74, 0x61, 0x67, 0x65, 0x2c, 0x20, + 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x68, + 0x6f, 0x73, 0x65, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x61, 0x6e, 0x6e, + 0x6f, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x61, 0x6e, 0x64, + 0x20, 0x74, 0x68, 0x6f, 0x73, 0x65, 0x20, 0x64, 0x65, 0x74, 0x65, 0x63, + 0x74, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x31, + 0x73, 0x74, 0x20, 0x73, 0x74, 0x65, 0x70, 0x20, 0x6f, 0x66, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x32, 0x2d, 0x70, 0x61, 0x73, 0x73, 0x20, 0x72, 0x75, + 0x6e, 0x0a, 0x0a, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x4e, 0x72, 0x65, 0x61, + 0x64, 0x73, 0x53, 0x6f, 0x66, 0x74, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x31, 0x0a, 0x20, 0x20, 0x20, + 
0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x73, 0x6f, 0x66, 0x74, 0x20, 0x6c, + 0x69, 0x6d, 0x69, 0x74, 0x20, 0x6f, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, + 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x73, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x4f, 0x75, 0x74, + 0x70, 0x75, 0x74, 0x3a, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x6c, + 0x0a, 0x6f, 0x75, 0x74, 0x46, 0x69, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, + 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2f, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, + 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, + 0x20, 0x28, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x69, 0x6e, 0x67, 0x20, + 0x66, 0x75, 0x6c, 0x6c, 0x20, 0x6f, 0x72, 0x20, 0x72, 0x65, 0x6c, 0x61, + 0x74, 0x69, 0x76, 0x65, 0x20, 0x70, 0x61, 0x74, 0x68, 0x29, 0x2e, 0x20, + 0x43, 0x61, 0x6e, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x62, 0x65, 0x20, + 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x20, 0x6f, 0x6e, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x61, 0x6e, 0x64, 0x20, 0x6c, + 0x69, 0x6e, 0x65, 0x2e, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x54, 0x6d, 0x70, + 0x44, 0x69, 0x72, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, + 0x6e, 0x67, 0x3a, 0x20, 0x70, 0x61, 0x74, 0x68, 0x20, 0x74, 0x6f, 0x20, + 0x61, 0x20, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x79, 0x20, + 0x74, 0x68, 0x61, 0x74, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, + 0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x61, 0x73, 0x20, 0x74, 0x65, 0x6d, + 0x70, 0x6f, 0x72, 0x61, 0x72, 0x79, 0x20, 0x62, 0x79, 0x20, 0x53, 0x54, + 0x41, 0x52, 0x2e, 0x20, 0x41, 0x6c, 0x6c, 0x20, 0x63, 0x6f, 0x6e, 0x74, + 
0x65, 0x6e, 0x74, 0x73, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, + 0x20, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x77, + 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x72, 0x65, 0x6d, 0x6f, 0x76, + 0x65, 0x64, 0x21, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2d, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, + 0x6d, 0x70, 0x20, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x79, + 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, + 0x74, 0x20, 0x74, 0x6f, 0x20, 0x6f, 0x75, 0x74, 0x46, 0x69, 0x6c, 0x65, + 0x4e, 0x61, 0x6d, 0x65, 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x5f, 0x53, + 0x54, 0x41, 0x52, 0x74, 0x6d, 0x70, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x54, + 0x6d, 0x70, 0x4b, 0x65, 0x65, 0x70, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x77, 0x68, 0x65, + 0x74, 0x68, 0x65, 0x72, 0x20, 0x74, 0x6f, 0x20, 0x6b, 0x65, 0x65, 0x70, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x70, 0x6f, 0x72, + 0x61, 0x72, 0x79, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x61, 0x66, + 0x74, 0x65, 0x72, 0x20, 0x53, 0x54, 0x41, 0x52, 0x20, 0x72, 0x75, 0x6e, + 0x73, 0x20, 0x69, 0x73, 0x20, 0x66, 0x69, 0x6e, 0x69, 0x73, 0x68, 0x65, + 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, + 0x6e, 0x65, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x72, 0x65, 0x6d, 0x6f, 0x76, + 0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6f, 0x72, + 0x61, 0x72, 0x79, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x41, 0x6c, 0x6c, 0x20, 0x2e, 0x2e, + 0x20, 0x6b, 0x65, 0x65, 0x70, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x66, 0x69, + 0x6c, 0x65, 0x73, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x74, 0x64, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x4c, 0x6f, 0x67, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, + 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x6f, + 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, + 0x65, 0x20, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x65, 0x64, 0x20, 0x74, + 0x6f, 0x20, 0x73, 0x74, 0x64, 0x6f, 0x75, 0x74, 0x20, 0x28, 0x73, 0x74, + 0x61, 0x6e, 0x64, 0x61, 0x72, 0x64, 0x20, 0x6f, 0x75, 0x74, 0x29, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4c, 0x6f, 0x67, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6c, + 0x6f, 0x67, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x53, 0x41, 0x4d, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x61, + 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x69, 0x6e, + 0x20, 0x53, 0x41, 0x4d, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x20, + 0x28, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x6e, 0x6f, 0x72, 0x6d, 0x61, + 0x6c, 0x6c, 0x79, 0x20, 0x61, 0x72, 0x65, 0x20, 0x6f, 0x75, 0x74, 0x70, + 0x75, 0x74, 0x20, 0x74, 0x6f, 0x20, 0x41, 0x6c, 0x69, 0x67, 0x6e, 0x65, + 
0x64, 0x2e, 0x6f, 0x75, 0x74, 0x2e, 0x73, 0x61, 0x6d, 0x20, 0x66, 0x69, + 0x6c, 0x65, 0x29, 0x2c, 0x20, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x20, + 0x73, 0x74, 0x61, 0x6e, 0x64, 0x61, 0x72, 0x64, 0x20, 0x6f, 0x75, 0x74, + 0x70, 0x75, 0x74, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x67, 0x6f, 0x20, + 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x4c, 0x6f, 0x67, 0x2e, 0x73, 0x74, 0x64, + 0x2e, 0x6f, 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x42, 0x41, 0x4d, 0x5f, 0x55, 0x6e, 0x73, 0x6f, 0x72, 0x74, 0x65, + 0x64, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, + 0x74, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x66, 0x6f, + 0x72, 0x6d, 0x61, 0x74, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x6f, 0x72, 0x74, + 0x65, 0x64, 0x2e, 0x20, 0x52, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x73, + 0x20, 0x2d, 0x2d, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x74, 0x79, 0x70, + 0x65, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x55, 0x6e, 0x73, 0x6f, 0x72, 0x74, + 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x42, + 0x41, 0x4d, 0x5f, 0x53, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x42, 0x79, 0x43, + 0x6f, 0x6f, 0x72, 0x64, 0x69, 0x6e, 0x61, 0x74, 0x65, 0x20, 0x2e, 0x2e, + 0x2e, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, + 0x20, 0x69, 0x6e, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x66, 0x6f, 0x72, 0x6d, + 0x61, 0x74, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, + 0x2e, 0x20, 0x52, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x73, 0x20, 0x2d, + 0x2d, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x74, 0x79, 0x70, 0x65, 0x20, + 0x42, 0x41, 0x4d, 0x20, 0x53, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x42, 0x79, + 
0x43, 0x6f, 0x6f, 0x72, 0x64, 0x69, 0x6e, 0x61, 0x74, 0x65, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x42, 0x41, 0x4d, 0x5f, 0x51, + 0x75, 0x61, 0x6e, 0x74, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x61, 0x6c, + 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x74, 0x6f, 0x20, + 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x6d, + 0x65, 0x20, 0x69, 0x6e, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x66, 0x6f, 0x72, + 0x6d, 0x61, 0x74, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x6f, 0x72, 0x74, 0x65, + 0x64, 0x2e, 0x20, 0x52, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x73, 0x20, + 0x2d, 0x2d, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x4d, 0x6f, 0x64, 0x65, 0x20, + 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x6d, + 0x65, 0x53, 0x41, 0x4d, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x52, 0x65, 0x61, + 0x64, 0x73, 0x55, 0x6e, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x64, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x73, 0x74, + 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x20, 0x6f, 0x66, 0x20, 0x75, 0x6e, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x64, + 0x20, 0x61, 0x6e, 0x64, 0x20, 0x70, 0x61, 0x72, 0x74, 0x69, 0x61, 0x6c, + 0x6c, 0x79, 0x20, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x64, 0x20, 0x28, 0x69, + 0x2e, 0x65, 0x2e, 0x20, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x64, 0x20, 0x6f, + 0x6e, 0x6c, 0x79, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x65, + 0x20, 0x6f, 0x66, 0x20, 0x61, 0x20, 0x70, 0x61, 0x69, 0x72, 0x65, 0x64, + 0x20, 0x65, 0x6e, 0x64, 0x20, 0x72, 0x65, 0x61, 0x64, 0x29, 0x20, 0x72, + 0x65, 0x61, 0x64, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x65, 0x70, 0x61, + 0x72, 0x61, 0x74, 0x65, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x28, 0x73, 0x29, + 
0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, + 0x6e, 0x65, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6e, 0x6f, + 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x46, 0x61, 0x73, 0x74, 0x78, 0x20, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x69, + 0x6e, 0x20, 0x73, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, 0x65, 0x20, 0x66, + 0x61, 0x73, 0x74, 0x61, 0x2f, 0x66, 0x61, 0x73, 0x74, 0x71, 0x20, 0x66, + 0x69, 0x6c, 0x65, 0x73, 0x2c, 0x20, 0x55, 0x6e, 0x6d, 0x61, 0x70, 0x70, + 0x65, 0x64, 0x2e, 0x6f, 0x75, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x65, 0x31, + 0x2f, 0x32, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x51, 0x53, 0x63, 0x6f, 0x6e, + 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x41, 0x64, 0x64, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x30, 0x0a, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x61, 0x64, + 0x64, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x71, 0x75, 0x61, + 0x6c, 0x69, 0x74, 0x79, 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x20, 0x28, + 0x65, 0x2e, 0x67, 0x2e, 0x20, 0x74, 0x6f, 0x20, 0x63, 0x6f, 0x6e, 0x76, + 0x65, 0x72, 0x74, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x49, 0x6c, 0x6c, + 0x75, 0x6d, 0x69, 0x6e, 0x61, 0x20, 0x74, 0x6f, 0x20, 0x53, 0x61, 0x6e, + 0x67, 0x65, 0x72, 0x2c, 0x20, 0x75, 0x73, 0x65, 0x20, 0x2d, 0x33, 0x31, + 0x29, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x4d, 0x75, 0x6c, 0x74, 0x69, 0x6d, + 0x61, 0x70, 0x70, 0x65, 0x72, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4f, + 
0x6c, 0x64, 0x5f, 0x32, 0x2e, 0x34, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, + 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, + 0x20, 0x6f, 0x66, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x6d, 0x61, 0x70, + 0x70, 0x69, 0x6e, 0x67, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, + 0x6e, 0x74, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6f, + 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4f, 0x6c, 0x64, 0x5f, + 0x32, 0x2e, 0x34, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x71, 0x75, 0x61, 0x73, + 0x69, 0x2d, 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x6f, 0x72, 0x64, + 0x65, 0x72, 0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x62, 0x65, 0x66, 0x6f, + 0x72, 0x65, 0x20, 0x32, 0x2e, 0x35, 0x2e, 0x30, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x52, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, + 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x65, 0x61, 0x63, 0x68, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x2d, 0x6d, + 0x61, 0x70, 0x70, 0x65, 0x72, 0x2e, 0x20, 0x52, 0x65, 0x61, 0x64, 0x20, + 0x6d, 0x61, 0x74, 0x65, 0x73, 0x20, 0x28, 0x70, 0x61, 0x69, 0x72, 0x73, + 0x29, 0x20, 0x61, 0x72, 0x65, 0x20, 0x61, 0x6c, 0x77, 0x61, 0x79, 0x73, + 0x20, 0x61, 0x64, 0x6a, 0x61, 0x63, 0x65, 0x6e, 0x74, 0x2c, 0x20, 0x61, + 0x6c, 0x6c, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, + 
0x20, 0x66, 0x6f, 0x72, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x20, 0x73, 0x74, 0x61, 0x79, 0x20, 0x74, 0x6f, 0x67, 0x65, + 0x74, 0x68, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x6f, + 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, + 0x65, 0x63, 0x6f, 0x6d, 0x65, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, + 0x74, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x66, 0x75, 0x74, + 0x75, 0x72, 0x65, 0x20, 0x72, 0x65, 0x6c, 0x65, 0x61, 0x73, 0x65, 0x73, + 0x2e, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x4f, 0x75, 0x74, 0x70, 0x75, + 0x74, 0x3a, 0x20, 0x53, 0x41, 0x4d, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x42, + 0x41, 0x4d, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x74, 0x79, 0x70, + 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x53, + 0x41, 0x4d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, + 0x67, 0x73, 0x3a, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6f, 0x66, 0x20, + 0x53, 0x41, 0x4d, 0x2f, 0x42, 0x41, 0x4d, 0x20, 0x6f, 0x75, 0x74, 0x70, + 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, + 0x73, 0x74, 0x20, 0x77, 0x6f, 0x72, 0x64, 0x3a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x20, 0x2e, 0x2e, + 0x2e, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x42, 0x41, 0x4d, + 0x20, 0x77, 0x69, 0x74, 0x68, 0x6f, 0x75, 0x74, 0x20, 0x73, 0x6f, 0x72, + 0x74, 0x69, 0x6e, 0x67, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x53, 0x41, 0x4d, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6f, 0x75, + 0x74, 0x70, 0x75, 0x74, 0x20, 0x53, 0x41, 0x4d, 0x20, 0x77, 0x69, 0x74, + 0x68, 0x6f, 0x75, 0x74, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x69, 0x6e, 0x67, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x6e, + 0x65, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6e, 0x6f, 0x20, 0x53, 0x41, 0x4d, + 0x2f, 0x42, 0x41, 0x4d, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x6e, 0x64, 0x2c, + 0x20, 0x33, 0x72, 0x64, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x55, 0x6e, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, + 0x20, 0x73, 0x74, 0x61, 0x6e, 0x64, 0x61, 0x72, 0x64, 0x20, 0x75, 0x6e, + 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x53, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x42, 0x79, 0x43, + 0x6f, 0x6f, 0x72, 0x64, 0x69, 0x6e, 0x61, 0x74, 0x65, 0x20, 0x2e, 0x2e, + 0x2e, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20, + 0x63, 0x6f, 0x6f, 0x72, 0x64, 0x69, 0x6e, 0x61, 0x74, 0x65, 0x2e, 0x20, + 0x54, 0x68, 0x69, 0x73, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x63, 0x61, 0x74, + 0x65, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x20, 0x6d, 0x65, 0x6d, 0x6f, + 
0x72, 0x79, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x69, + 0x6e, 0x67, 0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x63, 0x61, 0x6e, + 0x20, 0x62, 0x65, 0x20, 0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x65, + 0x64, 0x20, 0x62, 0x79, 0x20, 0x2d, 0x2d, 0x6c, 0x69, 0x6d, 0x69, 0x74, + 0x42, 0x41, 0x4d, 0x73, 0x6f, 0x72, 0x74, 0x52, 0x41, 0x4d, 0x2e, 0x0a, + 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x6d, 0x6f, 0x64, 0x65, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x46, 0x75, 0x6c, + 0x6c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, + 0x3a, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x53, 0x41, + 0x4d, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x20, 0x2e, 0x2e, + 0x2e, 0x20, 0x6e, 0x6f, 0x20, 0x53, 0x41, 0x4d, 0x20, 0x6f, 0x75, 0x74, + 0x70, 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x46, 0x75, 0x6c, 0x6c, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x66, 0x75, 0x6c, + 0x6c, 0x20, 0x53, 0x41, 0x4d, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x51, + 0x53, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x66, 0x75, 0x6c, 0x6c, 0x20, 0x53, + 0x41, 0x4d, 0x20, 0x62, 0x75, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x6f, + 0x75, 0x74, 0x20, 0x71, 0x75, 0x61, 0x6c, 0x69, 0x74, 0x79, 0x20, 0x73, + 0x63, 0x6f, 0x72, 0x65, 0x73, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x41, + 
0x4d, 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x46, 0x69, 0x65, 0x6c, 0x64, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, + 0x43, 0x75, 0x66, 0x66, 0x6c, 0x69, 0x6e, 0x6b, 0x73, 0x2d, 0x6c, 0x69, + 0x6b, 0x65, 0x20, 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x20, 0x66, 0x69, + 0x65, 0x6c, 0x64, 0x20, 0x66, 0x6c, 0x61, 0x67, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6e, 0x6f, 0x74, + 0x20, 0x75, 0x73, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6e, 0x74, 0x72, 0x6f, 0x6e, 0x4d, 0x6f, 0x74, 0x69, + 0x66, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, + 0x20, 0x64, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x20, 0x66, 0x72, 0x6f, + 0x6d, 0x20, 0x74, 0x68, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x72, 0x6f, 0x6e, + 0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x2e, 0x20, 0x52, 0x65, 0x61, 0x64, + 0x73, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x69, 0x6e, 0x63, 0x6f, 0x6e, + 0x73, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x61, 0x6e, 0x64, 0x2f, + 0x6f, 0x72, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x63, 0x61, 0x6e, 0x6f, 0x6e, + 0x69, 0x63, 0x61, 0x6c, 0x20, 0x69, 0x6e, 0x74, 0x72, 0x6f, 0x6e, 0x73, + 0x20, 0x61, 0x72, 0x65, 0x20, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x65, + 0x64, 0x20, 0x6f, 0x75, 0x74, 0x2e, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, + 0x41, 0x4d, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x73, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x53, 0x74, 0x61, 0x6e, 0x64, 0x61, 0x72, 0x64, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, + 0x20, 0x61, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x6f, 0x66, + 0x20, 0x64, 0x65, 0x73, 0x69, 0x72, 0x65, 0x64, 0x20, 0x53, 0x41, 0x4d, + 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x73, 0x2c, + 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6f, 0x72, 0x64, 0x65, + 0x72, 0x20, 0x64, 0x65, 0x73, 0x69, 0x72, 0x65, 0x64, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x20, 0x53, 0x41, 0x4d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x4e, 0x48, 0x20, 0x48, 0x49, 0x20, 0x41, 0x53, 0x20, 0x6e, 0x4d, + 0x20, 0x4e, 0x4d, 0x20, 0x4d, 0x44, 0x20, 0x6a, 0x4d, 0x20, 0x6a, 0x49, + 0x20, 0x58, 0x53, 0x20, 0x4d, 0x43, 0x20, 0x63, 0x68, 0x20, 0x2e, 0x2e, + 0x2e, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x63, 0x6f, 0x6d, 0x62, 0x69, 0x6e, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x61, 0x6e, 0x79, + 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6e, 0x6f, 0x20, 0x61, 0x74, + 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x53, 0x74, 0x61, 0x6e, 0x64, 0x61, 0x72, + 0x64, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x4e, 0x48, 0x20, + 
0x48, 0x49, 0x20, 0x41, 0x53, 0x20, 0x6e, 0x4d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x41, 0x6c, 0x6c, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x4e, 0x48, 0x20, + 0x48, 0x49, 0x20, 0x41, 0x53, 0x20, 0x6e, 0x4d, 0x20, 0x4e, 0x4d, 0x20, + 0x4d, 0x44, 0x20, 0x6a, 0x4d, 0x20, 0x6a, 0x49, 0x20, 0x4d, 0x43, 0x20, + 0x63, 0x68, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x76, + 0x41, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x76, 0x61, 0x72, 0x69, 0x61, 0x6e, 0x74, 0x20, 0x61, + 0x6c, 0x6c, 0x65, 0x6c, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x76, 0x47, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x69, + 0x63, 0x20, 0x63, 0x6f, 0x6f, 0x72, 0x64, 0x69, 0x61, 0x6e, 0x74, 0x65, + 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x76, 0x61, 0x72, 0x69, + 0x61, 0x6e, 0x74, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x6c, 0x61, 0x70, 0x70, + 0x65, 0x64, 0x20, 0x62, 0x79, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x76, + 0x57, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x30, 0x2f, 0x31, 0x20, 0x2d, 0x20, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, + 
0x6e, 0x6f, 0x74, 0x20, 0x70, 0x61, 0x73, 0x73, 0x20, 0x2f, 0x20, 0x70, + 0x61, 0x73, 0x73, 0x65, 0x73, 0x20, 0x57, 0x41, 0x53, 0x50, 0x20, 0x66, + 0x69, 0x6c, 0x74, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x2e, 0x20, 0x52, 0x65, + 0x71, 0x75, 0x69, 0x72, 0x65, 0x73, 0x20, 0x2d, 0x2d, 0x77, 0x61, 0x73, + 0x70, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x4d, 0x6f, 0x64, 0x65, 0x20, + 0x53, 0x41, 0x4d, 0x74, 0x61, 0x67, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x43, 0x52, 0x20, 0x43, 0x59, 0x20, 0x55, 0x52, 0x20, + 0x55, 0x59, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, + 0x6e, 0x63, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x71, 0x75, 0x61, + 0x6c, 0x69, 0x74, 0x79, 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x73, 0x20, + 0x6f, 0x66, 0x20, 0x63, 0x65, 0x6c, 0x6c, 0x20, 0x62, 0x61, 0x72, 0x63, + 0x6f, 0x64, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x55, 0x4d, 0x49, + 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x6f, + 0x6c, 0x6f, 0x2a, 0x20, 0x64, 0x65, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x70, + 0x6c, 0x65, 0x78, 0x69, 0x6e, 0x67, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x55, 0x6e, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, + 0x65, 0x64, 0x2f, 0x75, 0x6e, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, + 0x74, 0x65, 0x64, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x42, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, + 0x6e, 0x74, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x20, 0x72, 0x65, 0x61, + 
0x64, 0x2f, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x69, 0x63, 0x20, 0x63, 0x6f, + 0x6f, 0x72, 0x64, 0x69, 0x6e, 0x61, 0x74, 0x65, 0x73, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x76, 0x52, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x20, 0x63, 0x6f, 0x6f, 0x72, 0x64, 0x69, 0x6e, 0x61, 0x74, + 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x76, 0x61, 0x72, + 0x69, 0x61, 0x6e, 0x74, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, + 0x61, 0x74, 0x74, 0x72, 0x49, 0x48, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, + 0x3d, 0x30, 0x3a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x49, 0x48, 0x20, 0x61, + 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x2e, 0x20, 0x30, 0x20, + 0x6d, 0x61, 0x79, 0x20, 0x62, 0x65, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, + 0x72, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20, 0x73, 0x6f, 0x6d, 0x65, 0x20, + 0x64, 0x6f, 0x77, 0x6e, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x73, + 0x6f, 0x66, 0x74, 0x77, 0x61, 0x72, 0x65, 0x2c, 0x20, 0x73, 0x75, 0x63, + 0x68, 0x20, 0x61, 0x73, 0x20, 0x43, 0x75, 0x66, 0x66, 0x6c, 0x69, 0x6e, + 0x6b, 0x73, 0x20, 0x6f, 0x72, 0x20, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, + 0x54, 0x69, 0x65, 0x2e, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, + 0x75, 0x6e, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x64, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, + 
0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, 0x3a, 0x20, 0x6f, 0x75, + 0x74, 0x70, 0x75, 0x74, 0x20, 0x6f, 0x66, 0x20, 0x75, 0x6e, 0x6d, 0x61, + 0x70, 0x70, 0x65, 0x64, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x20, 0x69, + 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x53, 0x41, 0x4d, 0x20, 0x66, 0x6f, + 0x72, 0x6d, 0x61, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x31, 0x73, 0x74, 0x20, 0x77, 0x6f, 0x72, 0x64, 0x3a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x20, + 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6e, 0x6f, 0x20, 0x6f, 0x75, 0x74, + 0x70, 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x57, 0x69, 0x74, 0x68, 0x69, 0x6e, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6f, + 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x75, 0x6e, 0x6d, 0x61, 0x70, 0x70, + 0x65, 0x64, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x20, 0x77, 0x69, 0x74, + 0x68, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x69, 0x6e, + 0x20, 0x53, 0x41, 0x4d, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x69, + 0x2e, 0x65, 0x2e, 0x20, 0x41, 0x6c, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x2e, + 0x6f, 0x75, 0x74, 0x2e, 0x73, 0x61, 0x6d, 0x29, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x6e, 0x64, 0x20, 0x77, 0x6f, 0x72, + 0x64, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4b, + 0x65, 0x65, 0x70, 0x50, 0x61, 0x69, 0x72, 0x73, 0x20, 0x2e, 0x2e, 0x2e, + 0x20, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x20, 0x75, 0x6e, 0x6d, 0x61, + 0x70, 0x70, 0x65, 0x64, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, + 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x20, 0x61, 0x6e, 0x64, 0x2c, 0x20, 0x69, + 0x6e, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x75, 0x6e, + 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, + 0x74, 0x2c, 0x20, 0x6b, 0x65, 0x65, 0x70, 0x20, 0x69, 0x74, 0x20, 0x61, + 0x64, 0x6a, 0x61, 0x63, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x6f, 0x20, 0x69, + 0x74, 0x73, 0x20, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x64, 0x20, 0x6d, 0x61, + 0x74, 0x65, 0x2e, 0x20, 0x4f, 0x6e, 0x6c, 0x79, 0x20, 0x61, 0x66, 0x66, + 0x65, 0x63, 0x74, 0x73, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x2d, 0x6d, + 0x61, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, + 0x2e, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x6f, 0x72, 0x64, + 0x65, 0x72, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x50, + 0x61, 0x69, 0x72, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, + 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6f, + 0x66, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x53, 0x41, 0x4d, 0x20, 0x6f, 0x75, + 0x74, 0x70, 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x50, 0x61, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x6f, 0x6e, 0x65, + 0x20, 0x6d, 0x61, 0x74, 0x65, 0x20, 0x61, 0x66, 0x74, 0x65, 0x72, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72, 0x20, 0x66, 0x6f, + 
0x72, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x70, 0x61, 0x69, 0x72, 0x65, 0x64, + 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x50, 0x61, 0x69, 0x72, + 0x65, 0x64, 0x4b, 0x65, 0x65, 0x70, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x4f, + 0x72, 0x64, 0x65, 0x72, 0x3a, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x6d, 0x61, + 0x74, 0x65, 0x20, 0x61, 0x66, 0x74, 0x65, 0x72, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, + 0x6c, 0x6c, 0x20, 0x70, 0x61, 0x69, 0x72, 0x65, 0x64, 0x20, 0x61, 0x6c, + 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2c, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x6b, + 0x65, 0x70, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x61, 0x6d, 0x65, + 0x20, 0x61, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x69, + 0x6e, 0x70, 0x75, 0x74, 0x20, 0x46, 0x41, 0x53, 0x54, 0x51, 0x20, 0x66, + 0x69, 0x6c, 0x65, 0x73, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, + 0x70, 0x72, 0x69, 0x6d, 0x61, 0x72, 0x79, 0x46, 0x6c, 0x61, 0x67, 0x09, + 0x09, 0x4f, 0x6e, 0x65, 0x42, 0x65, 0x73, 0x74, 0x53, 0x63, 0x6f, 0x72, + 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, + 0x3a, 0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x61, 0x6c, 0x69, 0x67, + 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x69, 0x64, 0x65, 0x72, 0x65, 0x64, 0x20, 0x70, 0x72, + 0x69, 0x6d, 0x61, 0x72, 0x79, 0x20, 0x2d, 0x20, 0x61, 0x6c, 0x6c, 0x20, + 0x6f, 0x74, 0x68, 0x65, 0x72, 0x73, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, + 0x62, 0x65, 0x20, 0x6d, 0x61, 0x72, 0x6b, 0x65, 0x64, 0x20, 0x77, 0x69, + 0x74, 0x68, 0x20, 0x30, 0x78, 0x31, 0x30, 0x30, 0x20, 0x62, 0x69, 0x74, + 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x46, 0x4c, 0x41, 0x47, + 
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4f, 0x6e, 0x65, + 0x42, 0x65, 0x73, 0x74, 0x53, 0x63, 0x6f, 0x72, 0x65, 0x20, 0x2e, 0x2e, + 0x2e, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x61, + 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x20, 0x77, 0x69, 0x74, + 0x68, 0x20, 0x74, 0x68, 0x65, 0x20, 0x62, 0x65, 0x73, 0x74, 0x20, 0x73, + 0x63, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x73, 0x20, 0x70, 0x72, 0x69, 0x6d, + 0x61, 0x72, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x41, 0x6c, 0x6c, 0x42, 0x65, 0x73, 0x74, 0x53, 0x63, 0x6f, 0x72, 0x65, + 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x77, 0x69, 0x74, 0x68, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x62, 0x65, 0x73, 0x74, 0x20, 0x73, 0x63, + 0x6f, 0x72, 0x65, 0x20, 0x61, 0x72, 0x65, 0x20, 0x70, 0x72, 0x69, 0x6d, + 0x61, 0x72, 0x79, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x72, + 0x65, 0x61, 0x64, 0x49, 0x44, 0x09, 0x09, 0x09, 0x53, 0x74, 0x61, 0x6e, + 0x64, 0x61, 0x72, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, + 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x49, 0x44, + 0x20, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x20, 0x74, 0x79, 0x70, 0x65, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x53, 0x74, 0x61, + 0x6e, 0x64, 0x61, 0x72, 0x64, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x66, 0x69, + 0x72, 0x73, 0x74, 0x20, 0x77, 0x6f, 0x72, 0x64, 0x20, 0x28, 0x75, 0x6e, + 0x74, 0x69, 0x6c, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x66, + 
0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, 0x20, 0x46, 0x41, 0x53, 0x54, + 0x78, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x49, 0x44, 0x20, 0x6c, 0x69, + 0x6e, 0x65, 0x2c, 0x20, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x69, 0x6e, 0x67, + 0x20, 0x2f, 0x31, 0x2c, 0x2f, 0x32, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x20, + 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x6e, 0x75, + 0x6d, 0x62, 0x65, 0x72, 0x20, 0x28, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x29, + 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x46, 0x41, 0x53, 0x54, + 0x78, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, + 0x41, 0x4d, 0x6d, 0x61, 0x70, 0x71, 0x55, 0x6e, 0x69, 0x71, 0x75, 0x65, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x35, 0x35, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x30, 0x20, 0x74, + 0x6f, 0x20, 0x32, 0x35, 0x35, 0x3a, 0x20, 0x74, 0x68, 0x65, 0x20, 0x4d, + 0x41, 0x50, 0x51, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x20, 0x6d, 0x61, 0x70, + 0x70, 0x65, 0x72, 0x73, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, + 0x66, 0x6c, 0x61, 0x67, 0x4f, 0x52, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x6e, 0x74, 0x3a, 0x20, 0x30, 0x20, 0x74, 0x6f, 0x20, 0x36, 0x35, 0x35, + 0x33, 0x35, 0x3a, 0x20, 0x73, 0x61, 0x6d, 0x20, 0x46, 0x4c, 0x41, 0x47, + 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x62, 0x69, 0x74, + 0x77, 0x69, 0x73, 0x65, 0x20, 0x4f, 0x52, 0x27, 0x64, 0x20, 0x77, 0x69, + 0x74, 0x68, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2c, 0x20, 0x69, 0x2e, 0x65, 0x2e, 0x20, 0x46, 0x4c, 0x41, 0x47, + 
0x3d, 0x46, 0x4c, 0x41, 0x47, 0x20, 0x7c, 0x20, 0x6f, 0x75, 0x74, 0x53, + 0x41, 0x4d, 0x66, 0x6c, 0x61, 0x67, 0x4f, 0x52, 0x2e, 0x20, 0x54, 0x68, + 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x65, + 0x64, 0x20, 0x61, 0x66, 0x74, 0x65, 0x72, 0x20, 0x61, 0x6c, 0x6c, 0x20, + 0x66, 0x6c, 0x61, 0x67, 0x73, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20, 0x62, + 0x65, 0x65, 0x6e, 0x20, 0x73, 0x65, 0x74, 0x20, 0x62, 0x79, 0x20, 0x53, + 0x54, 0x41, 0x52, 0x2c, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x66, 0x74, + 0x65, 0x72, 0x20, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x66, 0x6c, 0x61, + 0x67, 0x41, 0x4e, 0x44, 0x2e, 0x20, 0x43, 0x61, 0x6e, 0x20, 0x62, 0x65, + 0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20, 0x73, 0x65, 0x74, + 0x20, 0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x63, 0x20, 0x62, 0x69, + 0x74, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x61, 0x72, 0x65, 0x20, + 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x65, 0x74, 0x20, 0x6f, 0x74, 0x68, 0x65, + 0x72, 0x77, 0x69, 0x73, 0x65, 0x2e, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, + 0x41, 0x4d, 0x66, 0x6c, 0x61, 0x67, 0x41, 0x4e, 0x44, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x36, 0x35, 0x35, 0x33, + 0x35, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x30, + 0x20, 0x74, 0x6f, 0x20, 0x36, 0x35, 0x35, 0x33, 0x35, 0x3a, 0x20, 0x73, + 0x61, 0x6d, 0x20, 0x46, 0x4c, 0x41, 0x47, 0x20, 0x77, 0x69, 0x6c, 0x6c, + 0x20, 0x62, 0x65, 0x20, 0x62, 0x69, 0x74, 0x77, 0x69, 0x73, 0x65, 0x20, + 0x41, 0x4e, 0x44, 0x27, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x74, + 0x68, 0x69, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x69, + 0x2e, 0x65, 0x2e, 0x20, 0x46, 0x4c, 0x41, 0x47, 0x3d, 0x46, 0x4c, 0x41, + 0x47, 0x20, 0x26, 0x20, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x66, 0x6c, + 0x61, 0x67, 0x4f, 0x52, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, + 0x73, 0x20, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x65, 0x64, 0x20, 0x61, 0x66, + 0x74, 0x65, 0x72, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x66, 0x6c, 0x61, 0x67, + 
0x73, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20, 0x62, 0x65, 0x65, 0x6e, 0x20, + 0x73, 0x65, 0x74, 0x20, 0x62, 0x79, 0x20, 0x53, 0x54, 0x41, 0x52, 0x2c, + 0x20, 0x62, 0x75, 0x74, 0x20, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0x20, + 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x66, 0x6c, 0x61, 0x67, 0x4f, 0x52, + 0x2e, 0x20, 0x43, 0x61, 0x6e, 0x20, 0x62, 0x65, 0x20, 0x75, 0x73, 0x65, + 0x64, 0x20, 0x74, 0x6f, 0x20, 0x75, 0x6e, 0x73, 0x65, 0x74, 0x20, 0x73, + 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x63, 0x20, 0x62, 0x69, 0x74, 0x73, + 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x61, 0x72, 0x65, 0x20, 0x6e, 0x6f, + 0x74, 0x20, 0x73, 0x65, 0x74, 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72, 0x77, + 0x69, 0x73, 0x65, 0x2e, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, + 0x61, 0x74, 0x74, 0x72, 0x52, 0x47, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, 0x3a, 0x20, 0x53, + 0x41, 0x4d, 0x2f, 0x42, 0x41, 0x4d, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, + 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x2e, 0x20, + 0x54, 0x68, 0x65, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x77, 0x6f, + 0x72, 0x64, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x73, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x67, 0x72, 0x6f, + 0x75, 0x70, 0x20, 0x69, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x66, 0x69, 0x65, + 0x72, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x6d, 0x75, 0x73, 0x74, 0x20, 0x73, + 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x22, 0x49, + 0x44, 0x3a, 0x22, 0x2c, 0x20, 0x65, 0x2e, 0x67, 0x2e, 0x20, 0x2d, 0x2d, + 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x61, 0x74, 0x74, 0x72, 0x52, 0x47, + 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x49, 0x44, 0x3a, 0x78, 0x78, 0x78, 0x20, + 0x43, 0x4e, 0x3a, 0x79, 0x79, 0x20, 0x22, 0x44, 0x53, 0x3a, 0x7a, 0x20, + 0x7a, 0x20, 0x7a, 0x22, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x78, 0x78, 0x78, 0x20, 0x77, 0x69, + 
0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, + 0x61, 0x73, 0x20, 0x52, 0x47, 0x20, 0x74, 0x61, 0x67, 0x20, 0x74, 0x6f, + 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x20, + 0x41, 0x6e, 0x79, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73, 0x20, 0x69, + 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x61, 0x67, 0x20, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x73, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20, 0x74, 0x6f, + 0x20, 0x62, 0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x71, + 0x75, 0x6f, 0x74, 0x65, 0x64, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x43, 0x6f, 0x6d, 0x6d, 0x61, + 0x20, 0x73, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, 0x65, 0x64, 0x20, 0x52, + 0x47, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x63, 0x6f, 0x72, 0x72, + 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x64, 0x69, + 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6d, + 0x6d, 0x61, 0x20, 0x73, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, 0x65, 0x64, + 0x29, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x66, 0x69, 0x6c, 0x65, + 0x73, 0x20, 0x69, 0x6e, 0x20, 0x2d, 0x2d, 0x72, 0x65, 0x61, 0x64, 0x46, + 0x69, 0x6c, 0x65, 0x73, 0x49, 0x6e, 0x2e, 0x20, 0x43, 0x6f, 0x6d, 0x6d, + 0x61, 0x73, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x62, + 0x65, 0x20, 0x73, 0x75, 0x72, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x65, 0x64, + 0x20, 0x62, 0x79, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73, 0x2c, 0x20, + 0x65, 0x2e, 0x67, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, 0x6f, 0x75, 0x74, 0x53, 0x41, + 0x4d, 0x61, 0x74, 0x74, 0x72, 0x52, 0x47, 0x6c, 0x69, 0x6e, 0x65, 0x20, + 0x49, 0x44, 0x3a, 0x78, 0x78, 0x78, 0x20, 0x2c, 0x20, 0x49, 0x44, 0x3a, + 0x7a, 0x7a, 0x7a, 0x20, 0x22, 0x44, 0x53, 0x3a, 0x7a, 0x20, 0x7a, 0x22, + 0x20, 0x2c, 0x20, 0x49, 0x44, 0x3a, 0x79, 0x79, 0x79, 0x20, 0x44, 0x53, + 
0x3a, 0x79, 0x79, 0x79, 0x79, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x41, + 0x4d, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x48, 0x44, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x73, 0x3a, 0x20, 0x40, 0x48, + 0x44, 0x20, 0x28, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x29, 0x20, 0x6c, + 0x69, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x53, + 0x41, 0x4d, 0x20, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x0a, 0x0a, 0x6f, + 0x75, 0x74, 0x53, 0x41, 0x4d, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x50, + 0x47, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x73, + 0x3a, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x20, 0x40, 0x50, 0x47, 0x20, + 0x28, 0x73, 0x6f, 0x66, 0x74, 0x77, 0x61, 0x72, 0x65, 0x29, 0x20, 0x6c, + 0x69, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x53, + 0x41, 0x4d, 0x20, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x28, 0x69, + 0x6e, 0x20, 0x61, 0x64, 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, + 0x6f, 0x20, 0x53, 0x54, 0x41, 0x52, 0x29, 0x0a, 0x0a, 0x6f, 0x75, 0x74, + 0x53, 0x41, 0x4d, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x43, 0x6f, 0x6d, + 0x6d, 0x65, 0x6e, 0x74, 0x46, 0x69, 0x6c, 0x65, 0x20, 0x2d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x70, + 0x61, 0x74, 0x68, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x66, + 0x69, 0x6c, 0x65, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x40, 0x43, 0x4f, + 0x20, 0x28, 0x63, 0x6f, 0x6d, 0x6d, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x6c, + 0x69, 0x6e, 0x65, 0x73, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, + 0x53, 0x41, 0x4d, 0x20, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x0a, 0x0a, + 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x4e, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, + 
0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, 0x3a, 0x20, 0x66, 0x69, 0x6c, 0x74, + 0x65, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, + 0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6d, 0x61, 0x69, 0x6e, 0x20, + 0x53, 0x41, 0x4d, 0x2f, 0x42, 0x41, 0x4d, 0x20, 0x66, 0x69, 0x6c, 0x65, + 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x4b, 0x65, 0x65, 0x70, 0x4f, 0x6e, 0x6c, 0x79, 0x41, 0x64, + 0x64, 0x65, 0x64, 0x52, 0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, + 0x73, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x6b, + 0x65, 0x65, 0x70, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, 0x61, 0x64, + 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, + 0x61, 0x6c, 0x6c, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, + 0x74, 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x20, 0x72, 0x65, 0x66, 0x65, + 0x72, 0x65, 0x6e, 0x63, 0x65, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, + 0x63, 0x65, 0x73, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, 0x77, 0x69, + 0x74, 0x68, 0x20, 0x2d, 0x2d, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x46, + 0x61, 0x73, 0x74, 0x61, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x61, 0x74, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x70, 0x70, 0x69, 0x6e, 0x67, + 0x20, 0x73, 0x74, 0x61, 0x67, 0x65, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4b, 0x65, 0x65, 0x70, + 0x41, 0x6c, 0x6c, 0x41, 0x64, 0x64, 0x65, 0x64, 0x52, 0x65, 0x66, 0x65, + 0x72, 0x65, 0x6e, 0x63, 0x65, 0x73, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x20, + 0x6b, 0x65, 0x65, 0x70, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x20, 0x72, 0x65, 0x66, + 
0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, + 0x6e, 0x63, 0x65, 0x73, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, 0x77, + 0x69, 0x74, 0x68, 0x20, 0x2d, 0x2d, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, + 0x46, 0x61, 0x73, 0x74, 0x61, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x20, 0x61, + 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x70, 0x70, 0x69, 0x6e, + 0x67, 0x20, 0x73, 0x74, 0x61, 0x67, 0x65, 0x2e, 0x0a, 0x0a, 0x0a, 0x6f, + 0x75, 0x74, 0x53, 0x41, 0x4d, 0x6d, 0x75, 0x6c, 0x74, 0x4e, 0x6d, 0x61, + 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, + 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x6d, + 0x61, 0x78, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, + 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x70, 0x6c, 0x65, 0x20, 0x61, 0x6c, + 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x66, 0x6f, 0x72, + 0x20, 0x61, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x74, 0x68, 0x61, 0x74, + 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x6f, 0x75, 0x74, + 0x70, 0x75, 0x74, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x53, + 0x41, 0x4d, 0x2f, 0x42, 0x41, 0x4d, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, + 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2d, 0x31, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x61, 0x6c, 0x6c, + 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, + 0x28, 0x75, 0x70, 0x20, 0x74, 0x6f, 0x20, 0x2d, 0x2d, 0x6f, 0x75, 0x74, + 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x4d, 0x75, 0x6c, 0x74, 0x69, 0x6d, + 0x61, 0x70, 0x4e, 0x6d, 0x61, 0x78, 0x29, 0x20, 0x77, 0x69, 0x6c, 0x6c, + 0x20, 0x62, 0x65, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x0a, 0x0a, + 0x6f, 0x75, 0x74, 0x53, 0x41, 0x4d, 0x74, 0x6c, 0x65, 0x6e, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x63, + 
0x61, 0x6c, 0x63, 0x75, 0x6c, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6d, + 0x65, 0x74, 0x68, 0x6f, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x54, 0x4c, 0x45, 0x4e, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, + 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x53, 0x41, 0x4d, 0x2f, + 0x42, 0x41, 0x4d, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6d, 0x6f, 0x73, 0x74, + 0x20, 0x62, 0x61, 0x73, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x28, 0x2b, 0x29, 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x20, 0x6d, + 0x61, 0x74, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x69, 0x67, 0x68, 0x74, + 0x6d, 0x6f, 0x73, 0x74, 0x20, 0x62, 0x61, 0x73, 0x65, 0x20, 0x6f, 0x66, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x28, 0x2d, 0x29, 0x6d, 0x61, 0x74, 0x65, + 0x2e, 0x20, 0x28, 0x2b, 0x29, 0x73, 0x69, 0x67, 0x6e, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x28, 0x2b, 0x29, 0x73, 0x74, 0x72, + 0x61, 0x6e, 0x64, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6d, 0x6f, 0x73, 0x74, 0x20, + 0x62, 0x61, 0x73, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x61, 0x6e, 0x79, 0x20, + 0x6d, 0x61, 0x74, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x69, 0x67, 0x68, + 0x74, 0x6d, 0x6f, 0x73, 0x74, 0x20, 0x62, 0x61, 0x73, 0x65, 0x20, 0x6f, + 0x66, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x2e, 0x20, + 0x28, 0x2b, 0x29, 0x73, 0x69, 0x67, 0x6e, 0x20, 0x66, 0x6f, 0x72, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x20, 0x77, 0x69, 0x74, + 0x68, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6d, 0x6f, + 0x73, 0x74, 0x20, 0x62, 0x61, 0x73, 0x65, 0x2e, 0x20, 0x54, 0x68, 0x69, + 
0x73, 0x20, 0x69, 0x73, 0x20, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, + 0x6e, 0x74, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x31, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x6c, 0x61, 0x70, 0x70, 0x69, 0x6e, + 0x67, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x73, 0x20, 0x77, 0x69, 0x74, 0x68, + 0x20, 0x70, 0x72, 0x6f, 0x74, 0x72, 0x75, 0x64, 0x69, 0x6e, 0x67, 0x20, + 0x65, 0x6e, 0x64, 0x73, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x42, 0x41, 0x4d, + 0x63, 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x2d, 0x31, 0x20, 0x74, 0x6f, 0x20, 0x31, + 0x30, 0x20, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x72, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x65, 0x76, 0x65, 0x6c, + 0x2c, 0x20, 0x2d, 0x31, 0x3d, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, + 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, + 0x20, 0x28, 0x36, 0x3f, 0x29, 0x2c, 0x20, 0x30, 0x3d, 0x6e, 0x6f, 0x20, + 0x63, 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2c, + 0x20, 0x31, 0x30, 0x3d, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20, + 0x63, 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x0a, + 0x0a, 0x6f, 0x75, 0x74, 0x42, 0x41, 0x4d, 0x73, 0x6f, 0x72, 0x74, 0x69, + 0x6e, 0x67, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x4e, 0x20, 0x20, 0x20, + 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, + 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, + 0x6f, 0x66, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x20, 0x66, + 0x6f, 0x72, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x69, + 0x6e, 0x67, 0x2e, 0x20, 0x30, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x64, + 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74, 0x6f, 0x20, 0x6d, 0x69, + 0x6e, 0x28, 0x36, 0x2c, 0x2d, 0x2d, 0x72, 0x75, 0x6e, 0x54, 0x68, 0x72, + 0x65, 0x61, 0x64, 0x4e, 0x29, 0x2e, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x42, + 
0x41, 0x4d, 0x73, 0x6f, 0x72, 0x74, 0x69, 0x6e, 0x67, 0x42, 0x69, 0x6e, + 0x73, 0x4e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x35, 0x30, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x3e, 0x30, 0x3a, 0x20, + 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x67, + 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, 0x62, 0x69, 0x6e, 0x73, 0x20, 0x66, + 0x6f, 0x20, 0x63, 0x6f, 0x6f, 0x72, 0x64, 0x69, 0x6e, 0x61, 0x74, 0x65, + 0x2d, 0x73, 0x6f, 0x72, 0x74, 0x69, 0x6e, 0x67, 0x0a, 0x0a, 0x23, 0x23, + 0x23, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x70, 0x72, 0x6f, 0x63, 0x65, 0x73, + 0x73, 0x69, 0x6e, 0x67, 0x0a, 0x62, 0x61, 0x6d, 0x52, 0x65, 0x6d, 0x6f, + 0x76, 0x65, 0x44, 0x75, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x65, 0x73, + 0x54, 0x79, 0x70, 0x65, 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x6d, 0x61, 0x72, 0x6b, + 0x20, 0x64, 0x75, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x65, 0x73, 0x20, + 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x66, + 0x69, 0x6c, 0x65, 0x2c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6e, 0x6f, 0x77, + 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x77, 0x6f, 0x72, 0x6b, 0x73, 0x20, + 0x77, 0x69, 0x74, 0x68, 0x20, 0x28, 0x69, 0x29, 0x20, 0x73, 0x6f, 0x72, + 0x74, 0x65, 0x64, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x66, 0x65, 0x64, 0x20, + 0x77, 0x69, 0x74, 0x68, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x42, 0x41, + 0x4d, 0x66, 0x69, 0x6c, 0x65, 0x2c, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x28, + 0x69, 0x69, 0x29, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x61, 0x69, 0x72, + 0x65, 0x64, 0x2d, 0x65, 0x6e, 0x64, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, + 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 
0x2e, 0x2e, 0x20, 0x6e, 0x6f, 0x20, 0x64, 0x75, 0x70, 0x6c, 0x69, 0x63, + 0x61, 0x74, 0x65, 0x20, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x61, 0x6c, 0x2f, + 0x6d, 0x61, 0x72, 0x6b, 0x69, 0x6e, 0x67, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x55, 0x6e, 0x69, 0x71, + 0x75, 0x65, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x63, 0x61, 0x6c, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, + 0x6d, 0x61, 0x72, 0x6b, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x6d, 0x75, 0x6c, + 0x74, 0x69, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x72, 0x73, 0x2c, 0x20, 0x61, + 0x6e, 0x64, 0x20, 0x64, 0x75, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x65, + 0x20, 0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x20, 0x6d, 0x61, 0x70, 0x70, + 0x65, 0x72, 0x73, 0x2e, 0x20, 0x54, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6f, + 0x72, 0x64, 0x69, 0x6e, 0x61, 0x74, 0x65, 0x73, 0x2c, 0x20, 0x46, 0x4c, + 0x41, 0x47, 0x2c, 0x20, 0x43, 0x49, 0x47, 0x41, 0x52, 0x20, 0x6d, 0x75, + 0x73, 0x74, 0x20, 0x62, 0x65, 0x20, 0x69, 0x64, 0x65, 0x6e, 0x74, 0x69, + 0x63, 0x61, 0x6c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x55, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x49, 0x64, + 0x65, 0x6e, 0x74, 0x69, 0x63, 0x61, 0x6c, 0x4e, 0x6f, 0x74, 0x4d, 0x75, + 0x6c, 0x74, 0x69, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6d, 0x61, 0x72, + 0x6b, 0x20, 0x64, 0x75, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x65, 0x20, + 0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x20, 0x6d, 0x61, 0x70, 0x70, 0x65, + 0x72, 0x73, 0x20, 0x62, 0x75, 0x74, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x6d, + 0x75, 0x6c, 0x74, 0x69, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x72, 0x73, 0x2e, + 0x0a, 0x0a, 0x62, 0x61, 0x6d, 0x52, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x44, + 0x75, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x65, 0x73, 0x4d, 0x61, 0x74, + 0x65, 0x32, 0x62, 0x61, 0x73, 0x65, 0x73, 0x4e, 0x20, 0x20, 0x20, 0x30, + 
0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, + 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x62, 0x61, + 0x73, 0x65, 0x73, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x35, 0x27, 0x20, 0x6f, 0x66, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x20, + 0x32, 0x20, 0x74, 0x6f, 0x20, 0x75, 0x73, 0x65, 0x20, 0x69, 0x6e, 0x20, + 0x63, 0x6f, 0x6c, 0x6c, 0x61, 0x70, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x28, + 0x65, 0x2e, 0x67, 0x2e, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x52, 0x41, 0x4d, + 0x50, 0x41, 0x47, 0x45, 0x29, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x4f, + 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x57, 0x69, 0x67, 0x67, 0x6c, 0x65, + 0x0a, 0x6f, 0x75, 0x74, 0x57, 0x69, 0x67, 0x54, 0x79, 0x70, 0x65, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x6e, + 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, + 0x28, 0x73, 0x29, 0x3a, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6f, 0x66, + 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x20, 0x6f, 0x75, 0x74, 0x70, + 0x75, 0x74, 0x2c, 0x20, 0x65, 0x2e, 0x67, 0x2e, 0x20, 0x22, 0x62, 0x65, + 0x64, 0x47, 0x72, 0x61, 0x70, 0x68, 0x22, 0x20, 0x4f, 0x52, 0x20, 0x22, + 0x62, 0x65, 0x64, 0x47, 0x72, 0x61, 0x70, 0x68, 0x20, 0x72, 0x65, 0x61, + 0x64, 0x31, 0x5f, 0x35, 0x70, 0x22, 0x2e, 0x20, 0x52, 0x65, 0x71, 0x75, + 0x69, 0x72, 0x65, 0x73, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x20, + 0x42, 0x41, 0x4d, 0x3a, 0x20, 0x2d, 0x2d, 0x6f, 0x75, 0x74, 0x53, 0x41, + 0x4d, 0x74, 0x79, 0x70, 0x65, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x53, 0x6f, + 0x72, 0x74, 0x65, 0x64, 0x42, 0x79, 0x43, 0x6f, 0x6f, 0x72, 0x64, 0x69, + 0x6e, 0x61, 0x74, 0x65, 0x20, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x31, 0x73, 0x74, 0x20, 0x77, 0x6f, 0x72, 0x64, 0x3a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x6e, + 
0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, + 0x6e, 0x6f, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x20, 0x6f, 0x75, + 0x74, 0x70, 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x62, 0x65, 0x64, 0x47, 0x72, 0x61, 0x70, 0x68, 0x20, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x62, 0x65, 0x64, 0x47, 0x72, 0x61, 0x70, 0x68, + 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x77, 0x69, 0x67, 0x67, 0x6c, 0x65, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x77, 0x69, 0x67, 0x67, 0x6c, + 0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x6e, 0x64, 0x20, 0x77, 0x6f, 0x72, + 0x64, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x61, 0x64, 0x31, 0x5f, 0x35, 0x70, 0x20, 0x20, 0x20, 0x2e, 0x2e, + 0x2e, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x20, 0x66, 0x72, 0x6f, + 0x6d, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x35, 0x27, 0x20, 0x6f, 0x66, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x31, 0x73, 0x74, 0x20, 0x72, 0x65, 0x61, + 0x64, 0x2c, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x43, 0x41, 0x47, 0x45, 0x2f, 0x52, 0x41, 0x4d, 0x50, 0x41, + 0x47, 0x45, 0x20, 0x65, 0x74, 0x63, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x72, 0x65, 0x61, 0x64, 0x32, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, + 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x32, + 0x6e, 0x64, 0x20, 0x72, 0x65, 0x61, 0x64, 0x0a, 0x0a, 0x6f, 0x75, 0x74, + 
0x57, 0x69, 0x67, 0x53, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x53, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x65, + 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, + 0x3a, 0x20, 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x65, 0x64, 0x6e, 0x65, + 0x73, 0x73, 0x20, 0x6f, 0x66, 0x20, 0x77, 0x69, 0x67, 0x67, 0x6c, 0x65, + 0x2f, 0x62, 0x65, 0x64, 0x47, 0x72, 0x61, 0x70, 0x68, 0x20, 0x6f, 0x75, + 0x74, 0x70, 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x53, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x65, 0x64, 0x20, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x20, 0x73, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, + 0x65, 0x20, 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x73, 0x2c, 0x20, 0x73, + 0x74, 0x72, 0x31, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x74, 0x72, 0x32, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x55, 0x6e, 0x73, + 0x74, 0x72, 0x61, 0x6e, 0x64, 0x65, 0x64, 0x20, 0x2e, 0x2e, 0x2e, 0x20, + 0x20, 0x63, 0x6f, 0x6c, 0x6c, 0x61, 0x70, 0x73, 0x65, 0x64, 0x20, 0x73, + 0x74, 0x72, 0x61, 0x6e, 0x64, 0x73, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x57, + 0x69, 0x67, 0x52, 0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x73, + 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, + 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, + 0x69, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, + 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x69, + 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x77, 0x69, 0x67, + 0x67, 0x6c, 0x65, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x2c, 0x20, 0x65, 0x2e, + 0x67, 0x2e, 0x20, 0x22, 0x63, 0x68, 0x72, 0x22, 0x2c, 0x20, 0x64, 0x65, + 
0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x22, 0x2d, 0x22, 0x20, 0x2d, 0x20, + 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, + 0x72, 0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x73, 0x0a, 0x0a, + 0x6f, 0x75, 0x74, 0x57, 0x69, 0x67, 0x4e, 0x6f, 0x72, 0x6d, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x52, 0x50, 0x4d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, + 0x6e, 0x67, 0x3a, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6f, 0x66, 0x20, + 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x69, + 0x67, 0x6e, 0x61, 0x6c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x52, 0x50, 0x4d, 0x20, 0x20, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x20, 0x70, 0x65, + 0x72, 0x20, 0x6d, 0x69, 0x6c, 0x6c, 0x69, 0x6f, 0x6e, 0x20, 0x6f, 0x66, + 0x20, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x64, 0x20, 0x72, 0x65, 0x61, 0x64, + 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, + 0x20, 0x6e, 0x6f, 0x20, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2c, 0x20, 0x22, 0x72, 0x61, 0x77, 0x22, + 0x20, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x73, 0x0a, 0x0a, 0x23, 0x23, 0x23, + 0x20, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x46, 0x69, 0x6c, 0x74, + 0x65, 0x72, 0x69, 0x6e, 0x67, 0x0a, 0x6f, 0x75, 0x74, 0x46, 0x69, 0x6c, + 0x74, 0x65, 0x72, 0x54, 0x79, 0x70, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x74, 0x79, 0x70, + 
0x65, 0x20, 0x6f, 0x66, 0x20, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x69, + 0x6e, 0x67, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, + 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x73, + 0x74, 0x61, 0x6e, 0x64, 0x61, 0x72, 0x64, 0x20, 0x66, 0x69, 0x6c, 0x74, + 0x65, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x75, 0x73, 0x69, 0x6e, 0x67, 0x20, + 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, + 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x42, 0x79, 0x53, 0x4a, 0x6f, + 0x75, 0x74, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6b, 0x65, 0x65, 0x70, 0x20, + 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x74, 0x68, 0x6f, 0x73, 0x65, 0x20, 0x72, + 0x65, 0x61, 0x64, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x63, 0x6f, + 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x70, 0x61, 0x73, + 0x73, 0x65, 0x64, 0x20, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x69, 0x6e, + 0x67, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x53, 0x4a, 0x2e, 0x6f, 0x75, + 0x74, 0x2e, 0x74, 0x61, 0x62, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x46, 0x69, + 0x6c, 0x74, 0x65, 0x72, 0x4d, 0x75, 0x6c, 0x74, 0x69, 0x6d, 0x61, 0x70, + 0x53, 0x63, 0x6f, 0x72, 0x65, 0x52, 0x61, 0x6e, 0x67, 0x65, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, + 0x3a, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x20, + 0x72, 0x61, 0x6e, 0x67, 0x65, 0x20, 0x62, 0x65, 0x6c, 0x6f, 0x77, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20, + 0x73, 0x63, 0x6f, 0x72, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6d, 0x75, + 
0x6c, 0x74, 0x69, 0x6d, 0x61, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x61, + 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x0a, 0x0a, 0x6f, + 0x75, 0x74, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x4d, 0x75, 0x6c, 0x74, + 0x69, 0x6d, 0x61, 0x70, 0x4e, 0x6d, 0x61, 0x78, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x30, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, + 0x75, 0x6d, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, + 0x20, 0x6c, 0x6f, 0x63, 0x69, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x20, 0x69, 0x73, 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, + 0x64, 0x20, 0x74, 0x6f, 0x20, 0x6d, 0x61, 0x70, 0x20, 0x74, 0x6f, 0x2e, + 0x20, 0x41, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, + 0x28, 0x61, 0x6c, 0x6c, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x6d, + 0x29, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x6f, 0x75, + 0x74, 0x70, 0x75, 0x74, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x69, 0x66, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x6d, 0x61, + 0x70, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x6e, 0x6f, 0x20, 0x6d, 0x6f, 0x72, + 0x65, 0x20, 0x6c, 0x6f, 0x63, 0x69, 0x20, 0x74, 0x68, 0x61, 0x6e, 0x20, + 0x74, 0x68, 0x69, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4f, 0x74, 0x68, + 0x65, 0x72, 0x77, 0x69, 0x73, 0x65, 0x20, 0x6e, 0x6f, 0x20, 0x61, 0x6c, + 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x77, 0x69, 0x6c, + 0x6c, 0x20, 0x62, 0x65, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x2c, + 0x20, 0x61, 0x6e, 0x64, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, 0x61, + 0x64, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x63, 0x6f, + 0x75, 0x6e, 0x74, 0x65, 0x64, 0x20, 0x61, 0x73, 0x20, 0x22, 0x6d, 0x61, + 0x70, 0x70, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x6f, 0x6f, 0x20, + 0x6d, 0x61, 0x6e, 0x79, 0x20, 0x6c, 0x6f, 0x63, 0x69, 0x22, 0x20, 0x69, + 
0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x4c, 0x6f, 0x67, 0x2e, 0x66, 0x69, + 0x6e, 0x61, 0x6c, 0x2e, 0x6f, 0x75, 0x74, 0x20, 0x2e, 0x0a, 0x0a, 0x6f, + 0x75, 0x74, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x4d, 0x69, 0x73, 0x6d, + 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6d, 0x61, 0x78, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x30, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, + 0x6d, 0x65, 0x6e, 0x74, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, + 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x6f, 0x6e, 0x6c, 0x79, + 0x20, 0x69, 0x66, 0x20, 0x69, 0x74, 0x20, 0x68, 0x61, 0x73, 0x20, 0x6e, + 0x6f, 0x20, 0x6d, 0x6f, 0x72, 0x65, 0x20, 0x6d, 0x69, 0x73, 0x6d, 0x61, + 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x74, 0x68, 0x61, 0x6e, 0x20, 0x74, + 0x68, 0x69, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x0a, 0x0a, + 0x6f, 0x75, 0x74, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x4d, 0x69, 0x73, + 0x6d, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x76, 0x65, 0x72, 0x4c, 0x6d, + 0x61, 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, 0x33, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x61, 0x6c, 0x3a, 0x20, 0x61, 0x6c, + 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x20, 0x77, 0x69, 0x6c, 0x6c, + 0x20, 0x62, 0x65, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x6f, + 0x6e, 0x6c, 0x79, 0x20, 0x69, 0x66, 0x20, 0x69, 0x74, 0x73, 0x20, 0x72, + 0x61, 0x74, 0x69, 0x6f, 0x20, 0x6f, 0x66, 0x20, 0x6d, 0x69, 0x73, 0x6d, + 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x2a, 0x6d, + 0x61, 0x70, 0x70, 0x65, 0x64, 0x2a, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, + 0x68, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x65, 0x73, 0x73, 0x20, 0x74, 0x68, + 0x61, 0x6e, 0x20, 0x6f, 0x72, 0x20, 0x65, 0x71, 0x75, 0x61, 0x6c, 0x20, + 0x74, 0x6f, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x46, 0x69, 0x6c, 0x74, 0x65, + 0x72, 0x4d, 0x69, 0x73, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x76, + 
0x65, 0x72, 0x52, 0x65, 0x61, 0x64, 0x4c, 0x6d, 0x61, 0x78, 0x20, 0x20, + 0x31, 0x2e, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x61, 0x6c, + 0x3a, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x20, + 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x6f, 0x75, 0x74, 0x70, + 0x75, 0x74, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x69, 0x66, 0x20, 0x69, + 0x74, 0x73, 0x20, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x20, 0x6f, 0x66, 0x20, + 0x6d, 0x69, 0x73, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x74, + 0x6f, 0x20, 0x2a, 0x72, 0x65, 0x61, 0x64, 0x2a, 0x20, 0x6c, 0x65, 0x6e, + 0x67, 0x74, 0x68, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x65, 0x73, 0x73, 0x20, + 0x74, 0x68, 0x61, 0x6e, 0x20, 0x6f, 0x72, 0x20, 0x65, 0x71, 0x75, 0x61, + 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2e, 0x0a, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x46, 0x69, + 0x6c, 0x74, 0x65, 0x72, 0x53, 0x63, 0x6f, 0x72, 0x65, 0x4d, 0x69, 0x6e, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, + 0x3a, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x20, + 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x6f, 0x75, 0x74, 0x70, + 0x75, 0x74, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x69, 0x66, 0x20, 0x69, + 0x74, 0x73, 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x73, 0x20, + 0x68, 0x69, 0x67, 0x68, 0x65, 0x72, 0x20, 0x74, 0x68, 0x61, 0x6e, 0x20, + 0x6f, 0x72, 0x20, 0x65, 0x71, 0x75, 0x61, 0x6c, 0x20, 0x74, 0x6f, 0x20, + 0x74, 0x68, 0x69, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x0a, + 0x0a, 0x6f, 0x75, 0x74, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x53, 0x63, + 0x6f, 0x72, 0x65, 0x4d, 0x69, 0x6e, 0x4f, 0x76, 0x65, 0x72, 0x4c, 0x72, + 0x65, 0x61, 0x64, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, 0x36, + 0x36, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x61, 0x6c, 0x3a, 0x20, + 0x73, 0x61, 0x6d, 0x65, 0x20, 0x61, 0x73, 0x20, 0x6f, 0x75, 0x74, 0x46, + 
0x69, 0x6c, 0x74, 0x65, 0x72, 0x53, 0x63, 0x6f, 0x72, 0x65, 0x4d, 0x69, + 0x6e, 0x2c, 0x20, 0x62, 0x75, 0x74, 0x20, 0x20, 0x6e, 0x6f, 0x72, 0x6d, + 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x28, 0x73, + 0x75, 0x6d, 0x20, 0x6f, 0x66, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x73, 0x27, + 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x73, 0x20, 0x66, 0x6f, 0x72, + 0x20, 0x70, 0x61, 0x69, 0x72, 0x65, 0x64, 0x2d, 0x65, 0x6e, 0x64, 0x20, + 0x72, 0x65, 0x61, 0x64, 0x73, 0x29, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x46, + 0x69, 0x6c, 0x74, 0x65, 0x72, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6d, + 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, + 0x74, 0x3a, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, + 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x6f, 0x75, 0x74, + 0x70, 0x75, 0x74, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x69, 0x66, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, + 0x66, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x64, 0x20, 0x62, 0x61, + 0x73, 0x65, 0x73, 0x20, 0x69, 0x73, 0x20, 0x68, 0x69, 0x67, 0x68, 0x65, + 0x72, 0x20, 0x74, 0x68, 0x61, 0x6e, 0x20, 0x6f, 0x72, 0x20, 0x65, 0x71, + 0x75, 0x61, 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x46, + 0x69, 0x6c, 0x74, 0x65, 0x72, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6d, + 0x69, 0x6e, 0x4f, 0x76, 0x65, 0x72, 0x4c, 0x72, 0x65, 0x61, 0x64, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, 0x36, 0x36, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x61, 0x6c, 0x3a, 0x20, 0x73, 0x61, 0x6d, 0x20, 0x61, + 0x73, 0x20, 0x6f, 0x75, 0x74, 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x4d, + 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x62, 0x75, + 0x74, 0x20, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x64, + 
0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, 0x61, 0x64, + 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x28, 0x73, 0x75, 0x6d, + 0x20, 0x6f, 0x66, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x73, 0x27, 0x20, 0x6c, + 0x65, 0x6e, 0x67, 0x74, 0x68, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, + 0x61, 0x69, 0x72, 0x65, 0x64, 0x2d, 0x65, 0x6e, 0x64, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x73, 0x29, 0x2e, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x46, 0x69, + 0x6c, 0x74, 0x65, 0x72, 0x49, 0x6e, 0x74, 0x72, 0x6f, 0x6e, 0x4d, 0x6f, + 0x74, 0x69, 0x66, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x66, 0x69, 0x6c, 0x74, + 0x65, 0x72, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, + 0x20, 0x75, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x68, 0x65, 0x69, 0x72, + 0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x73, 0x0a, 0x09, 0x09, 0x09, 0x09, + 0x4e, 0x6f, 0x6e, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6e, + 0x6f, 0x20, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x0a, + 0x09, 0x09, 0x09, 0x09, 0x52, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x4e, 0x6f, + 0x6e, 0x63, 0x61, 0x6e, 0x6f, 0x6e, 0x69, 0x63, 0x61, 0x6c, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x20, 0x6f, 0x75, + 0x74, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, + 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, + 0x6e, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x63, 0x61, 0x6e, 0x6f, 0x6e, 0x69, + 0x63, 0x61, 0x6c, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x73, 0x0a, 0x09, 0x09, 0x09, 0x09, 0x52, 0x65, 0x6d, 0x6f, 0x76, 0x65, + 0x4e, 0x6f, 0x6e, 0x63, 0x61, 0x6e, 0x6f, 0x6e, 0x69, 0x63, 0x61, 0x6c, + 
0x55, 0x6e, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x65, 0x64, 0x20, + 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x20, + 0x6f, 0x75, 0x74, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, + 0x74, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, + 0x61, 0x69, 0x6e, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x63, 0x61, 0x6e, 0x6f, + 0x6e, 0x69, 0x63, 0x61, 0x6c, 0x20, 0x75, 0x6e, 0x61, 0x6e, 0x6e, 0x6f, + 0x74, 0x61, 0x74, 0x65, 0x64, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x73, 0x20, 0x77, 0x68, 0x65, 0x6e, 0x20, 0x75, 0x73, 0x69, + 0x6e, 0x67, 0x20, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x65, 0x64, + 0x20, 0x73, 0x70, 0x6c, 0x69, 0x63, 0x65, 0x20, 0x6a, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x64, 0x61, 0x74, 0x61, 0x62, 0x61, + 0x73, 0x65, 0x2e, 0x20, 0x54, 0x68, 0x65, 0x20, 0x61, 0x6e, 0x6e, 0x6f, + 0x74, 0x61, 0x74, 0x65, 0x64, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x63, 0x61, + 0x6e, 0x6f, 0x6e, 0x69, 0x63, 0x61, 0x6c, 0x20, 0x6a, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, + 0x65, 0x20, 0x6b, 0x65, 0x70, 0x74, 0x2e, 0x0a, 0x0a, 0x6f, 0x75, 0x74, + 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x49, 0x6e, 0x74, 0x72, 0x6f, 0x6e, + 0x53, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x52, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x49, + 0x6e, 0x63, 0x6f, 0x6e, 0x73, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x74, 0x53, + 0x74, 0x72, 0x61, 0x6e, 0x64, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, + 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x66, 0x69, 0x6c, 0x74, 0x65, + 0x72, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x52, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x49, + 0x6e, 0x63, 0x6f, 0x6e, 0x73, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x74, 0x53, + 0x74, 0x72, 0x61, 0x6e, 0x64, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x2e, 0x2e, 0x2e, 0x20, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x20, 0x61, + 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x74, 0x68, + 0x61, 0x74, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20, 0x6a, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x69, + 0x6e, 0x63, 0x6f, 0x6e, 0x73, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x74, 0x20, + 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x4e, 0x6f, 0x6e, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6e, + 0x6f, 0x20, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x0a, + 0x0a, 0x23, 0x23, 0x23, 0x20, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, + 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x53, + 0x70, 0x6c, 0x69, 0x63, 0x65, 0x20, 0x4a, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x73, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x4a, 0x66, 0x69, 0x6c, + 0x74, 0x65, 0x72, 0x52, 0x65, 0x61, 0x64, 0x73, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x41, 0x6c, 0x6c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, + 0x6e, 0x67, 0x3a, 0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x69, + 0x64, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x63, 0x6f, 0x6c, 0x6c, + 0x61, 0x70, 0x73, 0x65, 0x64, 0x20, 0x73, 0x70, 0x6c, 0x69, 0x63, 0x65, + 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x6f, + 0x75, 0x74, 0x70, 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x41, 0x6c, + 0x6c, 0x3a, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, + 0x2c, 0x20, 0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x2d, 0x20, 0x61, 0x6e, + 
0x64, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x2d, 0x6d, 0x61, 0x70, 0x70, + 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x55, 0x6e, 0x69, 0x71, + 0x75, 0x65, 0x3a, 0x20, 0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x6c, 0x79, + 0x20, 0x6d, 0x61, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x61, + 0x64, 0x73, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x0a, 0x0a, 0x6f, 0x75, 0x74, + 0x53, 0x4a, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x4f, 0x76, 0x65, 0x72, + 0x68, 0x61, 0x6e, 0x67, 0x4d, 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x30, 0x20, 0x20, 0x31, 0x32, 0x20, + 0x20, 0x31, 0x32, 0x20, 0x20, 0x31, 0x32, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x34, 0x20, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x65, 0x72, 0x73, 0x3a, 0x20, + 0x20, 0x20, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x75, 0x6d, 0x20, 0x6f, + 0x76, 0x65, 0x72, 0x68, 0x61, 0x6e, 0x67, 0x20, 0x6c, 0x65, 0x6e, 0x67, + 0x74, 0x68, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x73, 0x70, 0x6c, 0x69, 0x63, + 0x65, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, + 0x6f, 0x6e, 0x20, 0x62, 0x6f, 0x74, 0x68, 0x20, 0x73, 0x69, 0x64, 0x65, + 0x73, 0x20, 0x66, 0x6f, 0x72, 0x3a, 0x20, 0x28, 0x31, 0x29, 0x20, 0x6e, + 0x6f, 0x6e, 0x2d, 0x63, 0x61, 0x6e, 0x6f, 0x6e, 0x69, 0x63, 0x61, 0x6c, + 0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x73, 0x2c, 0x20, 0x28, 0x32, 0x29, + 0x20, 0x47, 0x54, 0x2f, 0x41, 0x47, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x43, + 0x54, 0x2f, 0x41, 0x43, 0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x2c, 0x20, + 0x28, 0x33, 0x29, 0x20, 0x47, 0x43, 0x2f, 0x41, 0x47, 0x20, 0x61, 0x6e, + 0x64, 0x20, 0x43, 0x54, 0x2f, 0x47, 0x43, 0x20, 0x6d, 0x6f, 0x74, 0x69, + 0x66, 0x2c, 0x20, 0x28, 0x34, 0x29, 0x20, 0x41, 0x54, 0x2f, 0x41, 0x43, + 0x20, 0x61, 0x6e, 0x64, 0x20, 0x47, 0x54, 0x2f, 0x41, 0x54, 0x20, 0x6d, + 0x6f, 0x74, 0x69, 0x66, 0x2e, 0x20, 0x2d, 0x31, 0x20, 0x6d, 0x65, 0x61, + 0x6e, 0x73, 0x20, 0x6e, 0x6f, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, + 
0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x6d, 0x6f, + 0x74, 0x69, 0x66, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x70, 0x70, + 0x6c, 0x79, 0x20, 0x74, 0x6f, 0x20, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, + 0x74, 0x65, 0x64, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x73, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x4a, 0x66, 0x69, 0x6c, 0x74, + 0x65, 0x72, 0x43, 0x6f, 0x75, 0x6e, 0x74, 0x55, 0x6e, 0x69, 0x71, 0x75, + 0x65, 0x4d, 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, + 0x20, 0x20, 0x20, 0x31, 0x20, 0x20, 0x20, 0x31, 0x20, 0x20, 0x20, 0x31, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x34, 0x20, 0x69, 0x6e, 0x74, 0x65, 0x67, + 0x65, 0x72, 0x73, 0x3a, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x75, 0x6d, + 0x20, 0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x6c, 0x79, 0x20, 0x6d, 0x61, + 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x63, + 0x6f, 0x75, 0x6e, 0x74, 0x20, 0x70, 0x65, 0x72, 0x20, 0x6a, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x6f, 0x72, 0x3a, 0x20, 0x28, + 0x31, 0x29, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x63, 0x61, 0x6e, 0x6f, 0x6e, + 0x69, 0x63, 0x61, 0x6c, 0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x73, 0x2c, + 0x20, 0x28, 0x32, 0x29, 0x20, 0x47, 0x54, 0x2f, 0x41, 0x47, 0x20, 0x61, + 0x6e, 0x64, 0x20, 0x43, 0x54, 0x2f, 0x41, 0x43, 0x20, 0x6d, 0x6f, 0x74, + 0x69, 0x66, 0x2c, 0x20, 0x28, 0x33, 0x29, 0x20, 0x47, 0x43, 0x2f, 0x41, + 0x47, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x43, 0x54, 0x2f, 0x47, 0x43, 0x20, + 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x2c, 0x20, 0x28, 0x34, 0x29, 0x20, 0x41, + 0x54, 0x2f, 0x41, 0x43, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x47, 0x54, 0x2f, + 0x41, 0x54, 0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x2e, 0x20, 0x2d, 0x31, + 0x20, 0x6d, 0x65, 0x61, 0x6e, 0x73, 0x20, 0x6e, 0x6f, 0x20, 0x6f, 0x75, + 
0x74, 0x70, 0x75, 0x74, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x61, + 0x74, 0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x4a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x20, 0x69, 0x66, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6f, + 0x75, 0x74, 0x53, 0x4a, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x43, 0x6f, + 0x75, 0x6e, 0x74, 0x55, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x4d, 0x69, 0x6e, + 0x20, 0x4f, 0x52, 0x20, 0x6f, 0x75, 0x74, 0x53, 0x4a, 0x66, 0x69, 0x6c, + 0x74, 0x65, 0x72, 0x43, 0x6f, 0x75, 0x6e, 0x74, 0x54, 0x6f, 0x74, 0x61, + 0x6c, 0x4d, 0x69, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x64, 0x69, 0x74, 0x69, + 0x6f, 0x6e, 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x73, 0x61, 0x74, 0x69, + 0x73, 0x66, 0x69, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x61, + 0x70, 0x70, 0x6c, 0x79, 0x20, 0x74, 0x6f, 0x20, 0x61, 0x6e, 0x6e, 0x6f, + 0x74, 0x61, 0x74, 0x65, 0x64, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x73, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x4a, 0x66, 0x69, + 0x6c, 0x74, 0x65, 0x72, 0x43, 0x6f, 0x75, 0x6e, 0x74, 0x54, 0x6f, 0x74, + 0x61, 0x6c, 0x4d, 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x20, + 0x20, 0x20, 0x31, 0x20, 0x20, 0x20, 0x31, 0x20, 0x20, 0x20, 0x31, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x34, 0x20, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x65, + 0x72, 0x73, 0x3a, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x75, 0x6d, 0x20, + 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x20, 0x28, 0x6d, 0x75, 0x6c, 0x74, 0x69, + 0x2d, 0x6d, 0x61, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x2b, 0x75, 0x6e, 0x69, + 
0x71, 0x75, 0x65, 0x29, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x63, 0x6f, + 0x75, 0x6e, 0x74, 0x20, 0x70, 0x65, 0x72, 0x20, 0x6a, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x6f, 0x72, 0x3a, 0x20, 0x28, 0x31, + 0x29, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x63, 0x61, 0x6e, 0x6f, 0x6e, 0x69, + 0x63, 0x61, 0x6c, 0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x73, 0x2c, 0x20, + 0x28, 0x32, 0x29, 0x20, 0x47, 0x54, 0x2f, 0x41, 0x47, 0x20, 0x61, 0x6e, + 0x64, 0x20, 0x43, 0x54, 0x2f, 0x41, 0x43, 0x20, 0x6d, 0x6f, 0x74, 0x69, + 0x66, 0x2c, 0x20, 0x28, 0x33, 0x29, 0x20, 0x47, 0x43, 0x2f, 0x41, 0x47, + 0x20, 0x61, 0x6e, 0x64, 0x20, 0x43, 0x54, 0x2f, 0x47, 0x43, 0x20, 0x6d, + 0x6f, 0x74, 0x69, 0x66, 0x2c, 0x20, 0x28, 0x34, 0x29, 0x20, 0x41, 0x54, + 0x2f, 0x41, 0x43, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x47, 0x54, 0x2f, 0x41, + 0x54, 0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x2e, 0x20, 0x2d, 0x31, 0x20, + 0x6d, 0x65, 0x61, 0x6e, 0x73, 0x20, 0x6e, 0x6f, 0x20, 0x6f, 0x75, 0x74, + 0x70, 0x75, 0x74, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x61, 0x74, + 0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x4a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, + 0x20, 0x61, 0x72, 0x65, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, + 0x69, 0x66, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6f, 0x75, + 0x74, 0x53, 0x4a, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x43, 0x6f, 0x75, + 0x6e, 0x74, 0x55, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x4d, 0x69, 0x6e, 0x20, + 0x4f, 0x52, 0x20, 0x6f, 0x75, 0x74, 0x53, 0x4a, 0x66, 0x69, 0x6c, 0x74, + 0x65, 0x72, 0x43, 0x6f, 0x75, 0x6e, 0x74, 0x54, 0x6f, 0x74, 0x61, 0x6c, + 0x4d, 0x69, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x64, 0x69, 0x74, 0x69, 0x6f, + 0x6e, 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x73, 0x61, 0x74, 0x69, 0x73, + 0x66, 0x69, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x70, + 0x70, 0x6c, 0x79, 0x20, 0x74, 0x6f, 0x20, 0x61, 0x6e, 0x6e, 0x6f, 0x74, + 0x61, 0x74, 0x65, 0x64, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x73, 0x0a, 0x0a, 0x6f, 0x75, 0x74, 0x53, 0x4a, 0x66, 0x69, 0x6c, + 0x74, 0x65, 0x72, 0x44, 0x69, 0x73, 0x74, 0x54, 0x6f, 0x4f, 0x74, 0x68, + 0x65, 0x72, 0x53, 0x4a, 0x6d, 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x31, 0x30, 0x20, 0x20, 0x30, 0x20, 0x20, 0x20, 0x35, 0x20, 0x20, 0x20, + 0x31, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x34, 0x20, 0x69, 0x6e, 0x74, + 0x65, 0x67, 0x65, 0x72, 0x73, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x6d, 0x69, + 0x6e, 0x69, 0x6d, 0x75, 0x6d, 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, + 0x64, 0x20, 0x64, 0x69, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x20, 0x74, + 0x6f, 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72, 0x20, 0x6a, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x27, 0x20, 0x64, 0x6f, 0x6e, 0x6f, 0x72, + 0x2f, 0x61, 0x63, 0x63, 0x65, 0x70, 0x74, 0x6f, 0x72, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, + 0x6f, 0x74, 0x20, 0x61, 0x70, 0x70, 0x6c, 0x79, 0x20, 0x74, 0x6f, 0x20, + 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x65, 0x64, 0x20, 0x6a, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x0a, 0x0a, 0x6f, 0x75, 0x74, + 0x53, 0x4a, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x49, 0x6e, 0x74, 0x72, + 0x6f, 0x6e, 0x4d, 0x61, 0x78, 0x56, 0x73, 0x52, 0x65, 0x61, 0x64, 0x4e, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x35, 0x30, 0x30, 0x30, + 0x30, 0x20, 0x31, 0x30, 0x30, 0x30, 0x30, 0x30, 0x20, 0x32, 0x30, 0x30, + 0x30, 0x30, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x20, 0x69, 0x6e, + 
0x74, 0x65, 0x67, 0x65, 0x72, 0x73, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x6d, + 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20, 0x67, 0x61, 0x70, 0x20, 0x61, + 0x6c, 0x6c, 0x6f, 0x77, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6a, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x73, 0x75, 0x70, + 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20, 0x31, 0x2c, + 0x32, 0x2c, 0x33, 0x2c, 0x2c, 0x2c, 0x4e, 0x20, 0x72, 0x65, 0x61, 0x64, + 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x2e, + 0x65, 0x2e, 0x20, 0x62, 0x79, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, + 0x74, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, + 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x20, 0x62, 0x79, + 0x20, 0x31, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x63, 0x61, 0x6e, 0x20, + 0x68, 0x61, 0x76, 0x65, 0x20, 0x67, 0x61, 0x70, 0x73, 0x20, 0x3c, 0x3d, + 0x35, 0x30, 0x30, 0x30, 0x30, 0x62, 0x2c, 0x20, 0x62, 0x79, 0x20, 0x32, + 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x3a, 0x20, 0x3c, 0x3d, 0x31, 0x30, + 0x30, 0x30, 0x30, 0x30, 0x62, 0x2c, 0x20, 0x62, 0x79, 0x20, 0x33, 0x20, + 0x72, 0x65, 0x61, 0x64, 0x73, 0x3a, 0x20, 0x3c, 0x3d, 0x32, 0x30, 0x30, + 0x30, 0x30, 0x30, 0x2e, 0x20, 0x62, 0x79, 0x20, 0x3e, 0x3d, 0x34, 0x20, + 0x72, 0x65, 0x61, 0x64, 0x73, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x67, 0x61, + 0x70, 0x20, 0x3c, 0x3d, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x49, 0x6e, 0x74, + 0x72, 0x6f, 0x6e, 0x4d, 0x61, 0x78, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, + 0x61, 0x70, 0x70, 0x6c, 0x79, 0x20, 0x74, 0x6f, 0x20, 0x61, 0x6e, 0x6e, + 0x6f, 0x74, 0x61, 0x74, 0x65, 0x64, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, + 
0x69, 0x6f, 0x6e, 0x73, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x53, 0x63, + 0x6f, 0x72, 0x69, 0x6e, 0x67, 0x0a, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x47, + 0x61, 0x70, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x73, 0x70, + 0x6c, 0x69, 0x63, 0x65, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x20, 0x28, 0x69, + 0x6e, 0x64, 0x65, 0x70, 0x65, 0x6e, 0x64, 0x65, 0x6e, 0x74, 0x20, 0x6f, + 0x6e, 0x20, 0x69, 0x6e, 0x74, 0x72, 0x6f, 0x6e, 0x20, 0x6d, 0x6f, 0x74, + 0x69, 0x66, 0x29, 0x0a, 0x0a, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x47, 0x61, + 0x70, 0x4e, 0x6f, 0x6e, 0x63, 0x61, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x38, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x6e, 0x6f, + 0x6e, 0x2d, 0x63, 0x61, 0x6e, 0x6f, 0x6e, 0x69, 0x63, 0x61, 0x6c, 0x20, + 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x70, 0x65, 0x6e, + 0x61, 0x6c, 0x74, 0x79, 0x20, 0x28, 0x69, 0x6e, 0x20, 0x61, 0x64, 0x64, + 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x6f, 0x20, 0x73, 0x63, 0x6f, + 0x72, 0x65, 0x47, 0x61, 0x70, 0x29, 0x0a, 0x0a, 0x73, 0x63, 0x6f, 0x72, + 0x65, 0x47, 0x61, 0x70, 0x47, 0x43, 0x41, 0x47, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2d, 0x34, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x47, 0x43, 0x2f, 0x41, + 0x47, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x43, 0x54, 0x2f, 0x47, 0x43, 0x20, + 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x70, 0x65, 0x6e, + 0x61, 0x6c, 0x74, 0x79, 0x20, 0x28, 0x69, 0x6e, 0x20, 0x61, 0x64, 0x64, + 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x6f, 0x20, 0x73, 0x63, 0x6f, + 0x72, 0x65, 0x47, 0x61, 0x70, 0x29, 0x0a, 0x0a, 0x73, 0x63, 0x6f, 0x72, + 0x65, 0x47, 0x61, 0x70, 0x41, 0x54, 0x41, 0x43, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2d, 0x38, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x41, 0x54, 0x2f, 0x41, + 0x43, 0x20, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x47, 0x54, 0x2f, 0x41, 0x54, + 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x70, 0x65, + 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x20, 0x20, 0x28, 0x69, 0x6e, 0x20, 0x61, + 0x64, 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x6f, 0x20, 0x73, + 0x63, 0x6f, 0x72, 0x65, 0x47, 0x61, 0x70, 0x29, 0x0a, 0x0a, 0x73, 0x63, + 0x6f, 0x72, 0x65, 0x47, 0x65, 0x6e, 0x6f, 0x6d, 0x69, 0x63, 0x4c, 0x65, + 0x6e, 0x67, 0x74, 0x68, 0x4c, 0x6f, 0x67, 0x32, 0x73, 0x63, 0x61, 0x6c, + 0x65, 0x20, 0x20, 0x20, 0x2d, 0x30, 0x2e, 0x32, 0x35, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x20, 0x73, 0x63, 0x6f, 0x72, + 0x65, 0x20, 0x6c, 0x6f, 0x67, 0x61, 0x72, 0x69, 0x74, 0x68, 0x6d, 0x69, + 0x63, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x64, + 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x69, + 0x63, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x6f, 0x66, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, + 0x74, 0x3a, 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x47, 0x65, 0x6e, 0x6f, + 0x6d, 0x69, 0x63, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x4c, 0x6f, 0x67, + 0x32, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x2a, 0x6c, 0x6f, 0x67, 0x32, 0x28, + 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x69, 0x63, 0x4c, 0x65, 0x6e, 0x67, 0x74, + 0x68, 0x29, 0x0a, 0x0a, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x44, 0x65, 0x6c, + 0x4f, 0x70, 0x65, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x32, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, + 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, + 0x79, 0x0a, 0x0a, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x44, 0x65, 0x6c, 0x42, + 0x61, 0x73, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x32, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x65, 0x78, 0x74, 0x65, 0x6e, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x70, 0x65, + 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x20, 0x70, 0x65, 0x72, 0x20, 0x62, 0x61, + 0x73, 0x65, 0x20, 0x28, 0x69, 0x6e, 0x20, 0x61, 0x64, 0x64, 0x69, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x6f, 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, + 0x44, 0x65, 0x6c, 0x4f, 0x70, 0x65, 0x6e, 0x29, 0x0a, 0x0a, 0x73, 0x63, + 0x6f, 0x72, 0x65, 0x49, 0x6e, 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2d, 0x32, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, + 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6f, 0x70, 0x65, 0x6e, + 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x0a, 0x0a, 0x73, 0x63, + 0x6f, 0x72, 0x65, 0x49, 0x6e, 0x73, 0x42, 0x61, 0x73, 0x65, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2d, 0x32, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, + 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x65, 0x78, 0x74, 0x65, + 0x6e, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, + 0x79, 0x20, 0x70, 0x65, 0x72, 0x20, 0x62, 0x61, 0x73, 0x65, 0x20, 0x28, + 0x69, 0x6e, 0x20, 0x61, 0x64, 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x74, 0x6f, 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x49, 0x6e, 0x73, 0x4f, + 0x70, 0x65, 0x6e, 0x29, 0x0a, 0x0a, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x53, + 0x74, 0x69, 0x74, 0x63, 0x68, 0x53, 0x4a, 0x73, 0x68, 0x69, 0x66, 0x74, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, + 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x20, 0x72, 0x65, 0x64, 0x75, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x73, + 0x65, 0x61, 0x72, 0x63, 0x68, 0x69, 0x6e, 0x67, 0x20, 0x66, 0x6f, 0x72, + 
0x20, 0x53, 0x4a, 0x20, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x69, + 0x65, 0x73, 0x20, 0x69, 0x6e, 0x74, 0x68, 0x65, 0x20, 0x73, 0x74, 0x69, + 0x74, 0x63, 0x68, 0x69, 0x6e, 0x67, 0x20, 0x73, 0x74, 0x65, 0x70, 0x0a, + 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x41, 0x6c, 0x69, 0x67, 0x6e, 0x6d, + 0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x53, 0x65, 0x65, + 0x64, 0x69, 0x6e, 0x67, 0x0a, 0x0a, 0x73, 0x65, 0x65, 0x64, 0x53, 0x65, + 0x61, 0x72, 0x63, 0x68, 0x53, 0x74, 0x61, 0x72, 0x74, 0x4c, 0x6d, 0x61, + 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x35, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, + 0x3e, 0x30, 0x3a, 0x20, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x73, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x73, 0x65, 0x61, 0x72, 0x63, 0x68, 0x20, 0x73, + 0x74, 0x61, 0x72, 0x74, 0x20, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x20, 0x74, + 0x68, 0x72, 0x6f, 0x75, 0x67, 0x68, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, + 0x65, 0x61, 0x64, 0x20, 0x2d, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x20, 0x69, 0x73, 0x20, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x20, + 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x20, + 0x6e, 0x6f, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x65, 0x72, 0x20, 0x74, 0x68, + 0x61, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x0a, 0x0a, 0x73, 0x65, 0x65, 0x64, 0x53, 0x65, 0x61, 0x72, 0x63, + 0x68, 0x53, 0x74, 0x61, 0x72, 0x74, 0x4c, 0x6d, 0x61, 0x78, 0x4f, 0x76, + 0x65, 0x72, 0x4c, 0x72, 0x65, 0x61, 0x64, 0x20, 0x20, 0x20, 0x20, 0x31, + 0x2e, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x61, 0x6c, 0x3a, + 0x20, 0x73, 0x65, 0x65, 0x64, 0x53, 0x65, 0x61, 0x72, 0x63, 0x68, 0x53, + 0x74, 0x61, 0x72, 0x74, 0x4c, 0x6d, 0x61, 0x78, 0x20, 0x6e, 0x6f, 0x72, + 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20, 0x72, + 0x65, 0x61, 0x64, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x28, + 0x73, 0x75, 0x6d, 0x20, 0x6f, 0x66, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x73, + 
0x27, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x73, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x70, 0x61, 0x69, 0x72, 0x65, 0x64, 0x2d, 0x65, 0x6e, 0x64, + 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x29, 0x0a, 0x0a, 0x73, 0x65, 0x65, + 0x64, 0x53, 0x65, 0x61, 0x72, 0x63, 0x68, 0x4c, 0x6d, 0x61, 0x78, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x74, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x64, 0x65, 0x66, 0x69, + 0x6e, 0x65, 0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x78, 0x69, + 0x6d, 0x75, 0x6d, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x6f, + 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x65, 0x65, 0x64, 0x73, 0x2c, + 0x20, 0x69, 0x66, 0x20, 0x3d, 0x30, 0x20, 0x6d, 0x61, 0x78, 0x20, 0x73, + 0x65, 0x65, 0x64, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x69, 0x73, + 0x20, 0x69, 0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x65, 0x0a, 0x0a, 0x73, + 0x65, 0x65, 0x64, 0x4d, 0x75, 0x6c, 0x74, 0x69, 0x6d, 0x61, 0x70, 0x4e, + 0x6d, 0x61, 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x30, 0x30, + 0x30, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, + 0x3a, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x70, 0x69, 0x65, 0x63, 0x65, + 0x73, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x6d, 0x61, 0x70, 0x20, 0x66, + 0x65, 0x77, 0x65, 0x72, 0x20, 0x74, 0x68, 0x61, 0x6e, 0x20, 0x74, 0x68, + 0x69, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x61, 0x72, 0x65, + 0x20, 0x75, 0x74, 0x69, 0x6c, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x69, 0x6e, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x74, 0x69, 0x74, 0x63, 0x68, 0x69, + 0x6e, 0x67, 0x20, 0x70, 0x72, 0x6f, 0x63, 0x65, 0x64, 0x75, 0x72, 0x65, + 0x0a, 0x0a, 0x73, 0x65, 0x65, 0x64, 0x50, 0x65, 0x72, 0x52, 0x65, 0x61, + 0x64, 0x4e, 0x6d, 0x61, 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x31, 0x30, 0x30, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, + 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x20, 0x6e, 0x75, 0x6d, 0x62, + 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x73, 0x65, 0x65, 0x64, 0x73, 0x20, + 
0x70, 0x65, 0x72, 0x20, 0x72, 0x65, 0x61, 0x64, 0x0a, 0x0a, 0x73, 0x65, + 0x65, 0x64, 0x50, 0x65, 0x72, 0x57, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x4e, + 0x6d, 0x61, 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, 0x35, 0x30, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, + 0x78, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, + 0x73, 0x65, 0x65, 0x64, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, 0x77, 0x69, + 0x6e, 0x64, 0x6f, 0x77, 0x0a, 0x0a, 0x73, 0x65, 0x65, 0x64, 0x4e, 0x6f, + 0x6e, 0x65, 0x4c, 0x6f, 0x63, 0x69, 0x50, 0x65, 0x72, 0x57, 0x69, 0x6e, + 0x64, 0x6f, 0x77, 0x20, 0x20, 0x20, 0x20, 0x31, 0x30, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, + 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x6f, + 0x6e, 0x65, 0x20, 0x73, 0x65, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x63, 0x69, + 0x20, 0x70, 0x65, 0x72, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x0a, + 0x0a, 0x73, 0x65, 0x65, 0x64, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x4d, 0x69, + 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x32, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x69, 0x6e, 0x20, 0x6c, + 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x73, 0x65, 0x65, 0x64, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, + 0x63, 0x65, 0x73, 0x20, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x20, 0x62, 0x79, + 0x20, 0x4e, 0x73, 0x20, 0x6f, 0x72, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x20, + 0x67, 0x61, 0x70, 0x0a, 0x0a, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x49, 0x6e, + 0x74, 0x72, 0x6f, 0x6e, 0x4d, 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x31, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x75, 0x6d, 0x20, + 0x69, 0x6e, 0x74, 0x72, 0x6f, 0x6e, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x3a, + 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x69, 0x63, 0x20, 0x67, 0x61, 0x70, + 
0x20, 0x69, 0x73, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x69, 0x64, 0x65, 0x72, + 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x72, 0x6f, 0x6e, 0x20, 0x69, 0x66, + 0x20, 0x69, 0x74, 0x73, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, + 0x3d, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x49, 0x6e, 0x74, 0x72, 0x6f, 0x6e, + 0x4d, 0x69, 0x6e, 0x2c, 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72, 0x77, 0x69, + 0x73, 0x65, 0x20, 0x69, 0x74, 0x20, 0x69, 0x73, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x69, 0x64, 0x65, 0x72, 0x65, 0x64, 0x20, 0x44, 0x65, 0x6c, 0x65, + 0x74, 0x69, 0x6f, 0x6e, 0x0a, 0x0a, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x49, + 0x6e, 0x74, 0x72, 0x6f, 0x6e, 0x4d, 0x61, 0x78, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20, + 0x69, 0x6e, 0x74, 0x72, 0x6f, 0x6e, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, + 0x20, 0x69, 0x66, 0x20, 0x30, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x20, 0x69, + 0x6e, 0x74, 0x72, 0x6f, 0x6e, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x77, + 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x64, 0x65, 0x74, 0x65, 0x72, + 0x6d, 0x69, 0x6e, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20, 0x28, 0x32, 0x5e, + 0x77, 0x69, 0x6e, 0x42, 0x69, 0x6e, 0x4e, 0x62, 0x69, 0x74, 0x73, 0x29, + 0x2a, 0x77, 0x69, 0x6e, 0x41, 0x6e, 0x63, 0x68, 0x6f, 0x72, 0x44, 0x69, + 0x73, 0x74, 0x4e, 0x62, 0x69, 0x6e, 0x73, 0x0a, 0x0a, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x4d, 0x61, 0x74, 0x65, 0x73, 0x47, 0x61, 0x70, 0x4d, 0x61, + 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, + 0x75, 0x6d, 0x20, 0x67, 0x61, 0x70, 0x20, 0x62, 0x65, 0x74, 0x77, 0x65, + 0x65, 0x6e, 0x20, 0x74, 0x77, 0x6f, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x73, + 0x2c, 0x20, 0x69, 0x66, 0x20, 0x30, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x20, + 0x69, 0x6e, 0x74, 0x72, 0x6f, 0x6e, 0x20, 0x67, 0x61, 0x70, 0x20, 0x77, + 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x64, 0x65, 0x74, 0x65, 0x72, + 
0x6d, 0x69, 0x6e, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20, 0x28, 0x32, 0x5e, + 0x77, 0x69, 0x6e, 0x42, 0x69, 0x6e, 0x4e, 0x62, 0x69, 0x74, 0x73, 0x29, + 0x2a, 0x77, 0x69, 0x6e, 0x41, 0x6e, 0x63, 0x68, 0x6f, 0x72, 0x44, 0x69, + 0x73, 0x74, 0x4e, 0x62, 0x69, 0x6e, 0x73, 0x0a, 0x0a, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x53, 0x4a, 0x6f, 0x76, 0x65, 0x72, 0x68, 0x61, 0x6e, 0x67, + 0x4d, 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x35, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, + 0x3a, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x75, 0x6d, 0x20, 0x6f, 0x76, + 0x65, 0x72, 0x68, 0x61, 0x6e, 0x67, 0x20, 0x28, 0x69, 0x2e, 0x65, 0x2e, + 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, + 0x20, 0x66, 0x6f, 0x72, 0x20, 0x73, 0x70, 0x6c, 0x69, 0x63, 0x65, 0x64, + 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x0a, + 0x0a, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x53, 0x4a, 0x73, 0x74, 0x69, 0x74, + 0x63, 0x68, 0x4d, 0x69, 0x73, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6d, + 0x61, 0x78, 0x20, 0x20, 0x20, 0x30, 0x20, 0x2d, 0x31, 0x20, 0x30, 0x20, + 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x34, 0x2a, 0x69, 0x6e, 0x74, 0x3e, + 0x3d, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20, + 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x6d, 0x69, + 0x73, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x66, 0x6f, 0x72, + 0x20, 0x73, 0x74, 0x69, 0x74, 0x63, 0x68, 0x69, 0x6e, 0x67, 0x20, 0x6f, + 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x70, 0x6c, 0x69, 0x63, 0x65, + 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x28, + 0x2d, 0x31, 0x3a, 0x20, 0x6e, 0x6f, 0x20, 0x6c, 0x69, 0x6d, 0x69, 0x74, + 0x29, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x28, 0x31, 0x29, 0x20, 0x6e, + 0x6f, 0x6e, 0x2d, 0x63, 0x61, 0x6e, 0x6f, 0x6e, 0x69, 0x63, 0x61, 0x6c, + 
0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x73, 0x2c, 0x20, 0x28, 0x32, 0x29, + 0x20, 0x47, 0x54, 0x2f, 0x41, 0x47, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x43, + 0x54, 0x2f, 0x41, 0x43, 0x20, 0x6d, 0x6f, 0x74, 0x69, 0x66, 0x2c, 0x20, + 0x28, 0x33, 0x29, 0x20, 0x47, 0x43, 0x2f, 0x41, 0x47, 0x20, 0x61, 0x6e, + 0x64, 0x20, 0x43, 0x54, 0x2f, 0x47, 0x43, 0x20, 0x6d, 0x6f, 0x74, 0x69, + 0x66, 0x2c, 0x20, 0x28, 0x34, 0x29, 0x20, 0x41, 0x54, 0x2f, 0x41, 0x43, + 0x20, 0x61, 0x6e, 0x64, 0x20, 0x47, 0x54, 0x2f, 0x41, 0x54, 0x20, 0x6d, + 0x6f, 0x74, 0x69, 0x66, 0x2e, 0x0a, 0x0a, 0x61, 0x6c, 0x69, 0x67, 0x6e, + 0x53, 0x4a, 0x44, 0x42, 0x6f, 0x76, 0x65, 0x72, 0x68, 0x61, 0x6e, 0x67, + 0x4d, 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, + 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x75, 0x6d, 0x20, 0x6f, 0x76, 0x65, 0x72, + 0x68, 0x61, 0x6e, 0x67, 0x20, 0x28, 0x69, 0x2e, 0x65, 0x2e, 0x20, 0x62, + 0x6c, 0x6f, 0x63, 0x6b, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x66, + 0x6f, 0x72, 0x20, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, 0x65, 0x64, + 0x20, 0x28, 0x73, 0x6a, 0x64, 0x62, 0x29, 0x20, 0x73, 0x70, 0x6c, 0x69, + 0x63, 0x65, 0x64, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, + 0x74, 0x73, 0x0a, 0x0a, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x53, 0x70, 0x6c, + 0x69, 0x63, 0x65, 0x64, 0x4d, 0x61, 0x74, 0x65, 0x4d, 0x61, 0x70, 0x4c, + 0x6d, 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x69, 0x6e, + 0x69, 0x6d, 0x75, 0x6d, 0x20, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x64, 0x20, + 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, + 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x20, 0x74, + 0x68, 0x61, 0x74, 0x20, 0x69, 0x73, 0x20, 0x73, 0x70, 0x6c, 0x69, 0x63, + 0x65, 0x64, 0x0a, 0x0a, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x53, 0x70, 0x6c, + 0x69, 0x63, 0x65, 0x64, 0x4d, 0x61, 0x74, 0x65, 0x4d, 0x61, 0x70, 0x4c, + 
0x6d, 0x69, 0x6e, 0x4f, 0x76, 0x65, 0x72, 0x4c, 0x6d, 0x61, 0x74, 0x65, + 0x20, 0x30, 0x2e, 0x36, 0x36, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x61, 0x6c, 0x3e, 0x30, 0x3a, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x53, + 0x70, 0x6c, 0x69, 0x63, 0x65, 0x64, 0x4d, 0x61, 0x74, 0x65, 0x4d, 0x61, + 0x70, 0x4c, 0x6d, 0x69, 0x6e, 0x20, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, + 0x69, 0x7a, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20, 0x6d, 0x61, 0x74, 0x65, + 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x0a, 0x0a, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x57, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x73, 0x50, 0x65, 0x72, + 0x52, 0x65, 0x61, 0x64, 0x4e, 0x6d, 0x61, 0x78, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x31, 0x30, 0x30, 0x30, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x20, 0x6e, 0x75, + 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x77, 0x69, 0x6e, 0x64, + 0x6f, 0x77, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, 0x72, 0x65, 0x61, 0x64, + 0x0a, 0x0a, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x54, 0x72, 0x61, 0x6e, 0x73, + 0x63, 0x72, 0x69, 0x70, 0x74, 0x73, 0x50, 0x65, 0x72, 0x57, 0x69, 0x6e, + 0x64, 0x6f, 0x77, 0x4e, 0x6d, 0x61, 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x31, 0x30, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, + 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x20, 0x6e, 0x75, 0x6d, + 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, + 0x63, 0x72, 0x69, 0x70, 0x74, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, 0x77, + 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x0a, 0x0a, 0x61, 0x6c, 0x69, 0x67, 0x6e, + 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x73, 0x50, + 0x65, 0x72, 0x52, 0x65, 0x61, 0x64, 0x4e, 0x6d, 0x61, 0x78, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x31, 0x30, 0x30, 0x30, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x20, 0x6e, 0x75, + 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x69, 0x66, 0x66, + 
0x65, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, + 0x65, 0x6e, 0x74, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, 0x72, 0x65, 0x61, + 0x64, 0x20, 0x74, 0x6f, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x69, 0x64, 0x65, + 0x72, 0x0a, 0x0a, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x45, 0x6e, 0x64, 0x73, + 0x54, 0x79, 0x70, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x74, 0x79, 0x70, + 0x65, 0x20, 0x6f, 0x66, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x65, 0x6e, + 0x64, 0x73, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x73, + 0x74, 0x61, 0x6e, 0x64, 0x61, 0x72, 0x64, 0x20, 0x6c, 0x6f, 0x63, 0x61, + 0x6c, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x20, + 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x6f, 0x66, 0x74, 0x2d, 0x63, 0x6c, + 0x69, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x77, + 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x45, 0x6e, 0x64, 0x54, 0x6f, 0x45, 0x6e, 0x64, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, + 0x20, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x2d, 0x74, + 0x6f, 0x2d, 0x65, 0x6e, 0x64, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x61, + 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x20, 0x64, 0x6f, + 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x6f, 0x66, 0x74, 0x2d, 0x63, 0x6c, + 0x69, 0x70, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x35, 0x70, 0x4f, + 0x66, 0x52, 0x65, 0x61, 0x64, 0x31, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, + 0x20, 0x66, 0x75, 0x6c, 0x6c, 0x79, 0x20, 0x65, 0x78, 0x74, 0x65, 0x6e, + 0x64, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, 0x74, 0x68, 0x65, 0x20, 0x35, + 0x70, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, 0x61, + 0x64, 0x31, 0x2c, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x6f, 0x74, 0x68, 0x65, + 0x72, 0x20, 0x65, 0x6e, 0x64, 0x73, 0x3a, 0x20, 0x6c, 0x6f, 0x63, 0x61, + 0x6c, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x35, 0x70, 0x4f, 0x66, 0x52, 0x65, + 0x61, 0x64, 0x73, 0x31, 0x32, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x66, 0x75, + 0x6c, 0x6c, 0x79, 0x20, 0x65, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x20, 0x6f, + 0x6e, 0x6c, 0x79, 0x20, 0x74, 0x68, 0x65, 0x20, 0x35, 0x70, 0x20, 0x6f, + 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x62, 0x6f, 0x74, 0x68, 0x20, 0x72, + 0x65, 0x61, 0x64, 0x31, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x72, 0x65, 0x61, + 0x64, 0x32, 0x2c, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x6f, 0x74, 0x68, 0x65, + 0x72, 0x20, 0x65, 0x6e, 0x64, 0x73, 0x3a, 0x20, 0x6c, 0x6f, 0x63, 0x61, + 0x6c, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x0a, + 0x0a, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x45, 0x6e, 0x64, 0x73, 0x50, 0x72, + 0x6f, 0x74, 0x72, 0x75, 0x64, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x30, 0x20, 0x20, 0x20, 0x20, 0x43, 0x6f, 0x6e, 0x63, 0x6f, 0x72, + 0x64, 0x61, 0x6e, 0x74, 0x50, 0x61, 0x69, 0x72, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x69, 0x6e, 0x74, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, + 0x3a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x6c, + 0x6f, 0x77, 0x20, 0x70, 0x72, 0x6f, 0x74, 0x72, 0x75, 0x73, 0x69, 0x6f, + 0x6e, 0x20, 0x6f, 0x66, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, + 
0x6e, 0x74, 0x20, 0x65, 0x6e, 0x64, 0x73, 0x2c, 0x20, 0x69, 0x2e, 0x65, + 0x2e, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x28, 0x65, 0x6e, 0x64, + 0x29, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x2b, 0x73, 0x74, + 0x72, 0x61, 0x6e, 0x64, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x20, 0x64, 0x6f, + 0x77, 0x6e, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x6f, 0x66, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x28, 0x65, + 0x6e, 0x64, 0x29, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x2d, + 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x31, 0x73, 0x74, 0x20, 0x77, 0x6f, 0x72, 0x64, 0x3a, 0x20, 0x69, 0x6e, + 0x74, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20, 0x6e, + 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x70, 0x72, 0x6f, + 0x74, 0x72, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x62, 0x61, 0x73, 0x65, + 0x73, 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, 0x64, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x6e, + 0x64, 0x20, 0x77, 0x6f, 0x72, 0x64, 0x3a, 0x20, 0x73, 0x74, 0x72, 0x69, + 0x6e, 0x67, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x43, 0x6f, 0x6e, 0x63, 0x6f, 0x72, 0x64, 0x61, 0x6e, 0x74, 0x50, 0x61, + 0x69, 0x72, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x72, 0x65, 0x70, 0x6f, 0x72, + 0x74, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, + 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x7a, 0x65, + 0x72, 0x6f, 0x20, 0x70, 0x72, 0x6f, 0x74, 0x72, 0x75, 0x73, 0x69, 0x6f, + 
0x6e, 0x20, 0x61, 0x73, 0x20, 0x63, 0x6f, 0x6e, 0x63, 0x6f, 0x72, 0x64, + 0x61, 0x6e, 0x74, 0x20, 0x70, 0x61, 0x69, 0x72, 0x73, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x44, 0x69, 0x73, 0x63, 0x6f, 0x72, + 0x64, 0x61, 0x6e, 0x74, 0x50, 0x61, 0x69, 0x72, 0x20, 0x2e, 0x2e, 0x2e, + 0x20, 0x72, 0x65, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x61, 0x6c, 0x69, 0x67, + 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, + 0x6e, 0x6f, 0x6e, 0x2d, 0x7a, 0x65, 0x72, 0x6f, 0x20, 0x70, 0x72, 0x6f, + 0x74, 0x72, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x61, 0x73, 0x20, 0x64, + 0x69, 0x73, 0x63, 0x6f, 0x72, 0x64, 0x61, 0x6e, 0x74, 0x20, 0x70, 0x61, + 0x69, 0x72, 0x73, 0x0a, 0x0a, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x53, 0x6f, + 0x66, 0x74, 0x43, 0x6c, 0x69, 0x70, 0x41, 0x74, 0x52, 0x65, 0x66, 0x65, + 0x72, 0x65, 0x6e, 0x63, 0x65, 0x45, 0x6e, 0x64, 0x73, 0x20, 0x20, 0x20, + 0x20, 0x59, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, + 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x73, 0x6f, 0x66, 0x74, 0x2d, 0x63, 0x6c, 0x69, 0x70, + 0x70, 0x69, 0x6e, 0x67, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, + 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x70, + 0x61, 0x73, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x20, + 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x68, 0x72, 0x6f, 0x6d, + 0x6f, 0x73, 0x6f, 0x6d, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x59, 0x65, 0x73, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x61, + 0x6c, 0x6c, 0x6f, 0x77, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x4e, 0x6f, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x70, 0x72, 0x6f, + 0x68, 0x69, 0x62, 0x69, 0x74, 0x2c, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, + 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x61, 0x74, + 0x69, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x79, 0x20, 0x77, 0x69, 0x74, 0x68, + 0x20, 0x43, 0x75, 0x66, 0x66, 0x6c, 0x69, 0x6e, 0x6b, 0x73, 0x0a, 0x0a, + 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x49, 0x6e, 0x73, 0x65, 0x72, 0x74, 0x69, + 0x6f, 0x6e, 0x46, 0x6c, 0x75, 0x73, 0x68, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x4e, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, + 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x68, 0x6f, 0x77, 0x20, 0x74, 0x6f, 0x20, + 0x66, 0x6c, 0x75, 0x73, 0x68, 0x20, 0x61, 0x6d, 0x62, 0x69, 0x67, 0x75, + 0x6f, 0x75, 0x73, 0x20, 0x69, 0x6e, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x4e, 0x6f, 0x6e, 0x65, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, + 0x69, 0x6e, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x61, + 0x72, 0x65, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x66, 0x6c, 0x75, 0x73, 0x68, + 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x52, 0x69, 0x67, 0x68, 0x74, 0x20, 0x20, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x69, 0x6e, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, + 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x66, 0x6c, 0x75, 0x73, 0x68, 0x65, + 0x64, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x69, 0x67, + 0x68, 0x74, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x50, 0x61, 0x69, 0x72, + 0x65, 0x64, 0x2d, 0x45, 0x6e, 0x64, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, + 
0x0a, 0x70, 0x65, 0x4f, 0x76, 0x65, 0x72, 0x6c, 0x61, 0x70, 0x4e, 0x62, + 0x61, 0x73, 0x65, 0x73, 0x4d, 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x6e, 0x74, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, + 0x75, 0x6d, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, + 0x20, 0x6f, 0x76, 0x65, 0x72, 0x6c, 0x61, 0x70, 0x20, 0x62, 0x61, 0x73, + 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x72, 0x69, 0x67, 0x67, 0x65, + 0x72, 0x20, 0x6d, 0x61, 0x74, 0x65, 0x73, 0x20, 0x6d, 0x65, 0x72, 0x67, + 0x69, 0x6e, 0x67, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x72, 0x65, 0x61, 0x6c, + 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x0a, 0x0a, 0x70, 0x65, 0x4f, + 0x76, 0x65, 0x72, 0x6c, 0x61, 0x70, 0x4d, 0x4d, 0x70, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x30, 0x2e, 0x30, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x61, 0x6c, 0x2c, 0x20, 0x3e, 0x3d, 0x30, 0x20, 0x26, 0x20, 0x3c, 0x31, + 0x3a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, + 0x6d, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x6f, 0x72, 0x74, 0x69, 0x6f, 0x6e, + 0x20, 0x6f, 0x66, 0x20, 0x6d, 0x69, 0x73, 0x6d, 0x61, 0x74, 0x63, 0x68, + 0x65, 0x64, 0x20, 0x62, 0x61, 0x73, 0x65, 0x73, 0x20, 0x69, 0x6e, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x6c, 0x61, 0x70, 0x20, + 0x61, 0x72, 0x65, 0x61, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x57, 0x69, + 0x6e, 0x64, 0x6f, 0x77, 0x73, 0x2c, 0x20, 0x41, 0x6e, 0x63, 0x68, 0x6f, + 0x72, 0x73, 0x2c, 0x20, 0x42, 0x69, 0x6e, 0x6e, 0x69, 0x6e, 0x67, 0x0a, + 0x0a, 0x77, 0x69, 0x6e, 0x41, 0x6e, 0x63, 0x68, 0x6f, 0x72, 0x4d, 0x75, + 0x6c, 0x74, 0x69, 0x6d, 0x61, 0x70, 0x4e, 0x6d, 0x61, 0x78, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x35, 0x30, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, + 
0x61, 0x78, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, + 0x20, 0x6c, 0x6f, 0x63, 0x69, 0x20, 0x61, 0x6e, 0x63, 0x68, 0x6f, 0x72, + 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, + 0x64, 0x20, 0x74, 0x6f, 0x20, 0x6d, 0x61, 0x70, 0x20, 0x74, 0x6f, 0x0a, + 0x0a, 0x77, 0x69, 0x6e, 0x42, 0x69, 0x6e, 0x4e, 0x62, 0x69, 0x74, 0x73, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x36, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x3d, + 0x6c, 0x6f, 0x67, 0x32, 0x28, 0x77, 0x69, 0x6e, 0x42, 0x69, 0x6e, 0x29, + 0x2c, 0x20, 0x77, 0x68, 0x65, 0x72, 0x65, 0x20, 0x77, 0x69, 0x6e, 0x42, + 0x69, 0x6e, 0x20, 0x69, 0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x69, + 0x7a, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x62, 0x69, + 0x6e, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x77, 0x69, + 0x6e, 0x64, 0x6f, 0x77, 0x73, 0x2f, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, + 0x72, 0x69, 0x6e, 0x67, 0x2c, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x77, + 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x6f, + 0x63, 0x63, 0x75, 0x70, 0x79, 0x20, 0x61, 0x6e, 0x20, 0x69, 0x6e, 0x74, + 0x65, 0x67, 0x65, 0x72, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, + 0x6f, 0x66, 0x20, 0x62, 0x69, 0x6e, 0x73, 0x2e, 0x0a, 0x0a, 0x77, 0x69, + 0x6e, 0x41, 0x6e, 0x63, 0x68, 0x6f, 0x72, 0x44, 0x69, 0x73, 0x74, 0x4e, + 0x62, 0x69, 0x6e, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x39, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x20, 0x6e, + 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x62, 0x69, 0x6e, + 0x73, 0x20, 0x62, 0x65, 0x74, 0x77, 0x65, 0x65, 0x6e, 0x20, 0x74, 0x77, + 0x6f, 0x20, 0x61, 0x6e, 0x63, 0x68, 0x6f, 0x72, 0x73, 0x20, 0x74, 0x68, + 0x61, 0x74, 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x73, 0x20, 0x61, 0x67, + 
0x67, 0x72, 0x65, 0x67, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6f, 0x66, + 0x20, 0x61, 0x6e, 0x63, 0x68, 0x6f, 0x72, 0x73, 0x20, 0x69, 0x6e, 0x74, + 0x6f, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, + 0x0a, 0x0a, 0x77, 0x69, 0x6e, 0x46, 0x6c, 0x61, 0x6e, 0x6b, 0x4e, 0x62, + 0x69, 0x6e, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x34, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6c, + 0x6f, 0x67, 0x32, 0x28, 0x77, 0x69, 0x6e, 0x46, 0x6c, 0x61, 0x6e, 0x6b, + 0x29, 0x2c, 0x20, 0x77, 0x68, 0x65, 0x72, 0x65, 0x20, 0x77, 0x69, 0x6e, + 0x20, 0x46, 0x6c, 0x61, 0x6e, 0x6b, 0x20, 0x69, 0x73, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x72, + 0x69, 0x67, 0x68, 0x74, 0x20, 0x66, 0x6c, 0x61, 0x6e, 0x6b, 0x69, 0x6e, + 0x67, 0x20, 0x72, 0x65, 0x67, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, + 0x77, 0x0a, 0x0a, 0x77, 0x69, 0x6e, 0x52, 0x65, 0x61, 0x64, 0x43, 0x6f, + 0x76, 0x65, 0x72, 0x61, 0x67, 0x65, 0x52, 0x65, 0x6c, 0x61, 0x74, 0x69, + 0x76, 0x65, 0x4d, 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, + 0x2e, 0x35, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x61, 0x6c, 0x3e, + 0x3d, 0x30, 0x3a, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x75, 0x6d, 0x20, + 0x72, 0x65, 0x6c, 0x61, 0x74, 0x69, 0x76, 0x65, 0x20, 0x63, 0x6f, 0x76, + 0x65, 0x72, 0x61, 0x67, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, + 0x63, 0x65, 0x20, 0x62, 0x79, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x65, + 0x65, 0x64, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x61, 0x20, 0x77, 0x69, 0x6e, + 0x64, 0x6f, 0x77, 0x2c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x53, 0x54, 0x41, + 0x52, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x6c, 0x67, 0x6f, 0x72, 0x69, + 
0x74, 0x68, 0x6d, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x2e, 0x0a, 0x0a, 0x77, + 0x69, 0x6e, 0x52, 0x65, 0x61, 0x64, 0x43, 0x6f, 0x76, 0x65, 0x72, 0x61, + 0x67, 0x65, 0x42, 0x61, 0x73, 0x65, 0x73, 0x4d, 0x69, 0x6e, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, + 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x75, 0x6d, + 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x62, + 0x61, 0x73, 0x65, 0x73, 0x20, 0x63, 0x6f, 0x76, 0x65, 0x72, 0x65, 0x64, + 0x20, 0x62, 0x79, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x65, 0x65, 0x64, + 0x73, 0x20, 0x69, 0x6e, 0x20, 0x61, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, + 0x77, 0x20, 0x2c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x53, 0x54, 0x41, 0x52, + 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x6c, 0x67, 0x6f, 0x72, 0x69, 0x74, + 0x68, 0x6d, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x2e, 0x0a, 0x0a, 0x23, 0x23, + 0x23, 0x20, 0x43, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, 0x20, 0x41, + 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x0a, 0x63, 0x68, + 0x69, 0x6d, 0x4f, 0x75, 0x74, 0x54, 0x79, 0x70, 0x65, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x4a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, + 0x29, 0x3a, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x63, + 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, 0x20, 0x6f, 0x75, 0x74, 0x70, + 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4a, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x43, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, 0x2e, + 0x6f, 0x75, 0x74, 0x2e, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x53, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, + 0x65, 0x53, 0x41, 0x4d, 0x6f, 0x6c, 0x64, 0x20, 0x20, 0x2e, 0x2e, 0x2e, + 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x20, + 0x53, 0x41, 0x4d, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x73, 0x65, 0x70, + 0x61, 0x72, 0x61, 0x74, 0x65, 0x20, 0x43, 0x68, 0x69, 0x6d, 0x65, 0x72, + 0x69, 0x63, 0x2e, 0x6f, 0x75, 0x74, 0x2e, 0x73, 0x61, 0x6d, 0x20, 0x66, + 0x69, 0x6c, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x57, 0x69, 0x74, 0x68, + 0x69, 0x6e, 0x42, 0x41, 0x4d, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x69, + 0x6e, 0x74, 0x6f, 0x20, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x65, 0x64, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x66, 0x69, 0x6c, + 0x65, 0x73, 0x20, 0x28, 0x41, 0x6c, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x2e, + 0x2a, 0x2e, 0x62, 0x61, 0x6d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x57, + 0x69, 0x74, 0x68, 0x69, 0x6e, 0x42, 0x41, 0x4d, 0x20, 0x48, 0x61, 0x72, + 0x64, 0x43, 0x6c, 0x69, 0x70, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x28, + 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x29, 0x20, 0x68, 0x61, 0x72, + 0x64, 0x2d, 0x63, 0x6c, 0x69, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x69, + 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x49, 0x47, 0x41, 0x52, 0x20, + 0x66, 0x6f, 0x72, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, + 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, + 0x63, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, + 0x20, 0x28, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x69, 0x66, 0x20, + 
0x6e, 0x6f, 0x20, 0x32, 0x6e, 0x64, 0x20, 0x77, 0x6f, 0x72, 0x64, 0x20, + 0x69, 0x73, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x74, 0x29, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x57, 0x69, 0x74, 0x68, 0x69, 0x6e, 0x42, 0x41, + 0x4d, 0x20, 0x53, 0x6f, 0x66, 0x74, 0x43, 0x6c, 0x69, 0x70, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x73, 0x6f, 0x66, 0x74, 0x2d, 0x63, 0x6c, 0x69, + 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x43, 0x49, 0x47, 0x41, 0x52, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x73, + 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, + 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, 0x20, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x0a, 0x0a, 0x63, 0x68, 0x69, + 0x6d, 0x53, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x4d, 0x69, 0x6e, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x3d, + 0x30, 0x3a, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x75, 0x6d, 0x20, 0x6c, + 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x6f, 0x66, 0x20, 0x63, 0x68, 0x69, + 0x6d, 0x65, 0x72, 0x69, 0x63, 0x20, 0x73, 0x65, 0x67, 0x6d, 0x65, 0x6e, + 0x74, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x2c, 0x20, 0x69, 0x66, + 0x20, 0x3d, 0x3d, 0x30, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x63, 0x68, 0x69, + 0x6d, 0x65, 0x72, 0x69, 0x63, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x0a, 0x0a, 0x63, 0x68, 0x69, 0x6d, 0x53, 0x63, 0x6f, 0x72, 0x65, 0x4d, + 0x69, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x74, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x6d, 0x69, 0x6e, 0x69, + 0x6d, 0x75, 0x6d, 0x20, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x20, 0x28, 0x73, + 0x75, 0x6d, 0x6d, 0x65, 0x64, 0x29, 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, + 
0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x68, 0x69, 0x6d, + 0x65, 0x72, 0x69, 0x63, 0x20, 0x73, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, + 0x73, 0x0a, 0x0a, 0x63, 0x68, 0x69, 0x6d, 0x53, 0x63, 0x6f, 0x72, 0x65, + 0x44, 0x72, 0x6f, 0x70, 0x4d, 0x61, 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x30, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x6d, 0x61, + 0x78, 0x20, 0x64, 0x72, 0x6f, 0x70, 0x20, 0x28, 0x64, 0x69, 0x66, 0x66, + 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x29, 0x20, 0x6f, 0x66, 0x20, 0x63, + 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, 0x20, 0x73, 0x63, 0x6f, 0x72, + 0x65, 0x20, 0x28, 0x74, 0x68, 0x65, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x6f, + 0x66, 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x73, 0x20, 0x6f, 0x66, 0x20, + 0x61, 0x6c, 0x6c, 0x20, 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, + 0x20, 0x73, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x29, 0x20, 0x66, + 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, 0x61, 0x64, + 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x0a, 0x0a, 0x63, 0x68, 0x69, + 0x6d, 0x53, 0x63, 0x6f, 0x72, 0x65, 0x53, 0x65, 0x70, 0x61, 0x72, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x31, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, + 0x3d, 0x30, 0x3a, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x75, 0x6d, 0x20, + 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x20, 0x28, + 0x73, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x29, 0x20, + 0x62, 0x65, 0x74, 0x77, 0x65, 0x65, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, + 0x62, 0x65, 0x73, 0x74, 0x20, 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, + 0x63, 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x20, 0x61, 0x6e, 0x64, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74, 0x20, 0x6f, 0x6e, 0x65, + 0x0a, 0x0a, 0x63, 0x68, 0x69, 0x6d, 0x53, 0x63, 0x6f, 0x72, 0x65, 0x4a, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x4e, 0x6f, 0x6e, 0x47, 0x54, + 
0x41, 0x47, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x31, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, + 0x79, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, + 0x47, 0x54, 0x2f, 0x41, 0x47, 0x20, 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, + 0x69, 0x63, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x0a, + 0x0a, 0x63, 0x68, 0x69, 0x6d, 0x4a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x4f, 0x76, 0x65, 0x72, 0x68, 0x61, 0x6e, 0x67, 0x4d, 0x69, 0x6e, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x74, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x6d, 0x69, 0x6e, 0x69, + 0x6d, 0x75, 0x6d, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x68, 0x61, 0x6e, 0x67, + 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x20, 0x63, 0x68, 0x69, 0x6d, 0x65, + 0x72, 0x69, 0x63, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x0a, 0x0a, 0x63, 0x68, 0x69, 0x6d, 0x53, 0x65, 0x67, 0x6d, 0x65, 0x6e, + 0x74, 0x52, 0x65, 0x61, 0x64, 0x47, 0x61, 0x70, 0x4d, 0x61, 0x78, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x74, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x69, + 0x6d, 0x75, 0x6d, 0x20, 0x67, 0x61, 0x70, 0x20, 0x69, 0x6e, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x73, 0x65, 0x71, 0x75, + 0x65, 0x6e, 0x63, 0x65, 0x20, 0x62, 0x65, 0x74, 0x77, 0x65, 0x65, 0x6e, + 0x20, 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, 0x20, 0x73, 0x65, + 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x0a, 0x0a, 0x63, 0x68, 0x69, 0x6d, + 0x46, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x62, 0x61, 0x6e, 0x47, 0x65, 0x6e, 0x6f, 0x6d, 0x69, 0x63, 0x4e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, + 0x29, 0x3a, 0x20, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, + 0x20, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x73, 0x20, 0x66, 0x6f, 0x72, + 
0x20, 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, 0x20, 0x61, 0x6c, + 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6e, 0x6f, + 0x20, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x62, 0x61, 0x6e, 0x47, 0x65, 0x6e, 0x6f, 0x6d, 0x69, + 0x63, 0x4e, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x4e, 0x73, 0x20, 0x61, 0x72, + 0x65, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, + 0x64, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x65, 0x6e, + 0x6f, 0x6d, 0x65, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, + 0x20, 0x61, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x20, 0x74, 0x68, 0x65, 0x20, + 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, 0x20, 0x6a, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x0a, 0x0a, 0x63, 0x68, 0x69, 0x6d, 0x4d, + 0x61, 0x69, 0x6e, 0x53, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x4d, 0x75, + 0x6c, 0x74, 0x4e, 0x6d, 0x61, 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x31, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, + 0x3e, 0x3d, 0x31, 0x3a, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, + 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x6d, + 0x75, 0x6c, 0x74, 0x69, 0x2d, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, + 0x6e, 0x74, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, + 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, + 0x63, 0x20, 0x73, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x20, 0x3d, + 0x31, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x70, 0x72, 0x6f, 0x68, 0x69, + 0x62, 0x69, 0x74, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x6d, 0x61, 0x70, + 
0x70, 0x69, 0x6e, 0x67, 0x20, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x73, 0x65, + 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x0a, 0x0a, 0x63, 0x68, 0x69, + 0x6d, 0x4d, 0x75, 0x6c, 0x74, 0x69, 0x6d, 0x61, 0x70, 0x4e, 0x6d, 0x61, + 0x78, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x6d, + 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x20, 0x6f, 0x66, 0x20, 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, + 0x63, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x2d, 0x61, 0x6c, 0x69, 0x67, + 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x30, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x75, 0x73, 0x65, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x73, 0x63, 0x68, + 0x65, 0x6d, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x63, 0x68, 0x69, 0x6d, + 0x65, 0x72, 0x69, 0x63, 0x20, 0x64, 0x65, 0x74, 0x65, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x6f, 0x6e, 0x6c, + 0x79, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x69, 0x64, 0x65, 0x72, 0x65, 0x64, + 0x20, 0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x20, 0x61, 0x6c, 0x69, 0x67, + 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x0a, 0x0a, 0x63, 0x68, 0x69, 0x6d, + 0x4d, 0x75, 0x6c, 0x74, 0x69, 0x6d, 0x61, 0x70, 0x53, 0x63, 0x6f, 0x72, + 0x65, 0x52, 0x61, 0x6e, 0x67, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, + 0x74, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x63, + 0x6f, 0x72, 0x65, 0x20, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x2d, 0x6d, 0x61, 0x70, 0x70, + 0x69, 0x6e, 0x67, 0x20, 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x61, 0x73, + 
0x20, 0x62, 0x65, 0x6c, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x65, 0x20, 0x62, + 0x65, 0x73, 0x74, 0x20, 0x63, 0x68, 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, + 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x2e, 0x20, 0x4f, 0x6e, 0x6c, 0x79, + 0x20, 0x77, 0x6f, 0x72, 0x6b, 0x73, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, + 0x2d, 0x2d, 0x63, 0x68, 0x69, 0x6d, 0x4d, 0x75, 0x6c, 0x74, 0x69, 0x6d, + 0x61, 0x70, 0x4e, 0x6d, 0x61, 0x78, 0x20, 0x3e, 0x20, 0x31, 0x0a, 0x0a, + 0x63, 0x68, 0x69, 0x6d, 0x4e, 0x6f, 0x6e, 0x63, 0x68, 0x69, 0x6d, 0x53, + 0x63, 0x6f, 0x72, 0x65, 0x44, 0x72, 0x6f, 0x70, 0x4d, 0x69, 0x6e, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x30, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x3d, 0x30, 0x3a, 0x20, 0x74, + 0x6f, 0x20, 0x74, 0x72, 0x69, 0x67, 0x67, 0x65, 0x72, 0x20, 0x63, 0x68, + 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, 0x20, 0x64, 0x65, 0x74, 0x65, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x2c, 0x20, 0x74, 0x68, 0x65, 0x20, 0x64, 0x72, + 0x6f, 0x70, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x62, 0x65, + 0x73, 0x74, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x63, 0x68, 0x69, 0x6d, 0x65, + 0x72, 0x69, 0x63, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, + 0x74, 0x20, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x20, 0x77, 0x69, 0x74, 0x68, + 0x20, 0x72, 0x65, 0x73, 0x70, 0x65, 0x63, 0x74, 0x20, 0x74, 0x6f, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x6c, 0x65, 0x6e, + 0x67, 0x74, 0x68, 0x20, 0x68, 0x61, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x62, + 0x65, 0x20, 0x73, 0x6d, 0x61, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x74, 0x68, + 0x61, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x0a, 0x0a, 0x63, 0x68, 0x69, 0x6d, 0x4f, 0x75, 0x74, 0x4a, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x66, 0x6f, + 0x72, 0x6d, 0x61, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x79, 0x70, + 
0x65, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x68, + 0x69, 0x6d, 0x65, 0x72, 0x69, 0x63, 0x2e, 0x6f, 0x75, 0x74, 0x2e, 0x6a, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x69, 0x6c, 0x65, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x6e, 0x6f, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x65, 0x6e, + 0x74, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2f, 0x68, 0x65, 0x61, 0x64, + 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x31, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x65, 0x6e, + 0x74, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x61, 0x74, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x3a, 0x20, 0x63, 0x6f, 0x6d, 0x6d, + 0x61, 0x6e, 0x64, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x6e, 0x64, + 0x20, 0x4e, 0x72, 0x65, 0x61, 0x64, 0x73, 0x3a, 0x20, 0x74, 0x6f, 0x74, + 0x61, 0x6c, 0x2c, 0x20, 0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x2c, 0x20, + 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x0a, 0x0a, 0x23, 0x23, 0x23, 0x20, 0x51, + 0x75, 0x61, 0x6e, 0x74, 0x69, 0x66, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x6f, 0x66, 0x20, 0x41, 0x6e, 0x6e, 0x6f, 0x74, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x73, 0x0a, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x4d, 0x6f, + 0x64, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, + 0x3a, 0x20, 0x74, 0x79, 0x70, 0x65, 0x73, 0x20, 0x6f, 0x66, 0x20, 0x71, + 0x75, 0x61, 0x6e, 0x74, 0x69, 0x66, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, + 
0x6e, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x65, 0x64, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2d, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, + 0x20, 0x6e, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x54, 0x72, + 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x6d, 0x65, 0x53, + 0x41, 0x4d, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, + 0x74, 0x20, 0x53, 0x41, 0x4d, 0x2f, 0x42, 0x41, 0x4d, 0x20, 0x61, 0x6c, + 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x74, 0x6f, 0x20, + 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x6d, + 0x65, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x61, 0x20, 0x73, 0x65, 0x70, + 0x61, 0x72, 0x61, 0x74, 0x65, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x47, 0x65, 0x6e, 0x65, 0x43, 0x6f, 0x75, 0x6e, 0x74, + 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, + 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x20, + 0x70, 0x65, 0x72, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x0a, 0x0a, 0x71, 0x75, + 0x61, 0x6e, 0x74, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, + 0x74, 0x6f, 0x6d, 0x65, 0x42, 0x41, 0x4d, 0x63, 0x6f, 0x6d, 0x70, 0x72, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x31, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x74, 0x3a, 0x20, 0x2d, 0x32, 0x20, 0x74, 0x6f, 0x20, 0x31, + 0x30, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, + 
0x74, 0x6f, 0x6d, 0x65, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x63, 0x6f, 0x6d, + 0x70, 0x72, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x65, 0x76, + 0x65, 0x6c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x32, 0x20, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x6e, 0x6f, 0x20, 0x42, 0x41, 0x4d, 0x20, 0x6f, 0x75, + 0x74, 0x70, 0x75, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x31, 0x20, + 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, + 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, + 0x20, 0x28, 0x36, 0x3f, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, + 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6e, 0x6f, 0x20, 0x63, 0x6f, 0x6d, + 0x70, 0x72, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x31, 0x30, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6d, 0x61, 0x78, + 0x69, 0x6d, 0x75, 0x6d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x0a, 0x0a, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x54, + 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x6d, 0x65, + 0x42, 0x61, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x49, 0x6e, + 0x64, 0x65, 0x6c, 0x53, 0x6f, 0x66, 0x74, 0x63, 0x6c, 0x69, 0x70, 0x53, + 0x69, 0x6e, 0x67, 0x6c, 0x65, 0x65, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x70, 0x72, 0x6f, + 0x68, 0x69, 0x62, 0x69, 0x74, 0x20, 0x76, 0x61, 0x72, 0x69, 0x6f, 0x75, + 
0x73, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x20, + 0x74, 0x79, 0x70, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x49, 0x6e, 0x64, + 0x65, 0x6c, 0x53, 0x6f, 0x66, 0x74, 0x63, 0x6c, 0x69, 0x70, 0x53, 0x69, + 0x6e, 0x67, 0x6c, 0x65, 0x65, 0x6e, 0x64, 0x20, 0x20, 0x2e, 0x2e, 0x2e, + 0x20, 0x70, 0x72, 0x6f, 0x68, 0x69, 0x62, 0x69, 0x74, 0x20, 0x69, 0x6e, + 0x64, 0x65, 0x6c, 0x73, 0x2c, 0x20, 0x73, 0x6f, 0x66, 0x74, 0x20, 0x63, + 0x6c, 0x69, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x61, 0x6e, 0x64, 0x20, + 0x73, 0x69, 0x6e, 0x67, 0x6c, 0x65, 0x2d, 0x65, 0x6e, 0x64, 0x20, 0x61, + 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x2d, 0x20, + 0x63, 0x6f, 0x6d, 0x70, 0x61, 0x74, 0x69, 0x62, 0x6c, 0x65, 0x20, 0x77, + 0x69, 0x74, 0x68, 0x20, 0x52, 0x53, 0x45, 0x4d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x53, 0x69, 0x6e, 0x67, 0x6c, 0x65, 0x65, 0x6e, 0x64, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x70, 0x72, 0x6f, 0x68, 0x69, 0x62, 0x69, + 0x74, 0x20, 0x73, 0x69, 0x6e, 0x67, 0x6c, 0x65, 0x2d, 0x65, 0x6e, 0x64, + 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x0a, + 0x0a, 0x23, 0x23, 0x23, 0x20, 0x32, 0x2d, 0x70, 0x61, 0x73, 0x73, 0x20, + 0x4d, 0x61, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x0a, 0x74, 0x77, 0x6f, 0x70, + 0x61, 0x73, 0x73, 0x4d, 0x6f, 0x64, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x4e, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, + 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x32, 0x2d, 0x70, 0x61, 0x73, 0x73, 0x20, + 0x6d, 0x61, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x6d, 0x6f, 0x64, 0x65, + 
0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x31, 0x2d, + 0x70, 0x61, 0x73, 0x73, 0x20, 0x6d, 0x61, 0x70, 0x70, 0x69, 0x6e, 0x67, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x42, 0x61, 0x73, 0x69, 0x63, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x62, 0x61, 0x73, + 0x69, 0x63, 0x20, 0x32, 0x2d, 0x70, 0x61, 0x73, 0x73, 0x20, 0x6d, 0x61, + 0x70, 0x70, 0x69, 0x6e, 0x67, 0x2c, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, + 0x61, 0x6c, 0x6c, 0x20, 0x31, 0x73, 0x74, 0x20, 0x70, 0x61, 0x73, 0x73, + 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x69, + 0x6e, 0x73, 0x65, 0x72, 0x74, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x6f, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, 0x65, 0x20, + 0x69, 0x6e, 0x64, 0x69, 0x63, 0x65, 0x73, 0x20, 0x6f, 0x6e, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x66, 0x6c, 0x79, 0x0a, 0x0a, 0x74, 0x77, 0x6f, 0x70, + 0x61, 0x73, 0x73, 0x31, 0x72, 0x65, 0x61, 0x64, 0x73, 0x4e, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2d, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, 0x20, + 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x70, 0x72, 0x6f, 0x63, 0x65, + 0x73, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x31, + 0x73, 0x74, 0x20, 0x73, 0x74, 0x65, 0x70, 0x2e, 0x20, 0x55, 0x73, 0x65, + 0x20, 0x76, 0x65, 0x72, 0x79, 0x20, 0x6c, 0x61, 0x72, 0x67, 0x65, 0x20, + 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x28, 0x6f, 0x72, 0x20, 0x64, + 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x2d, 0x31, 0x29, 0x20, 0x74, + 
0x6f, 0x20, 0x6d, 0x61, 0x70, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x66, + 0x69, 0x72, 0x73, 0x74, 0x20, 0x73, 0x74, 0x65, 0x70, 0x2e, 0x0a, 0x0a, + 0x0a, 0x23, 0x23, 0x23, 0x20, 0x57, 0x41, 0x53, 0x50, 0x20, 0x70, 0x61, + 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x0a, 0x77, 0x61, 0x73, + 0x70, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x4d, 0x6f, 0x64, 0x65, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x4e, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, + 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x57, 0x41, 0x53, 0x50, 0x20, 0x61, + 0x6c, 0x6c, 0x65, 0x6c, 0x65, 0x2d, 0x73, 0x70, 0x65, 0x63, 0x69, 0x66, + 0x69, 0x63, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, + 0x70, 0x65, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, + 0x72, 0x65, 0x2d, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, + 0x6f, 0x72, 0x69, 0x67, 0x69, 0x6e, 0x61, 0x6c, 0x20, 0x57, 0x41, 0x53, + 0x50, 0x20, 0x6d, 0x61, 0x70, 0x70, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, + 0x79, 0x20, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x20, + 0x62, 0x79, 0x20, 0x42, 0x72, 0x79, 0x63, 0x65, 0x20, 0x76, 0x61, 0x6e, + 0x20, 0x64, 0x65, 0x20, 0x47, 0x65, 0x69, 0x6a, 0x6e, 0x2c, 0x20, 0x47, + 0x72, 0x61, 0x68, 0x61, 0x6d, 0x20, 0x4d, 0x63, 0x56, 0x69, 0x63, 0x6b, + 0x65, 0x72, 0x2c, 0x20, 0x59, 0x6f, 0x61, 0x76, 0x20, 0x47, 0x69, 0x6c, + 0x61, 0x64, 0x20, 0x26, 0x20, 0x4a, 0x6f, 0x6e, 0x61, 0x74, 0x68, 0x61, + 0x6e, 0x20, 0x4b, 0x20, 0x50, 0x72, 0x69, 0x74, 0x63, 0x68, 0x61, 0x72, + 0x64, 0x2e, 0x20, 0x50, 0x6c, 0x65, 0x61, 0x73, 0x65, 0x20, 0x63, 0x69, + 0x74, 0x65, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6f, 0x72, 0x69, 0x67, 0x69, + 0x6e, 0x61, 0x6c, 0x20, 0x57, 0x41, 0x53, 0x50, 0x20, 0x70, 0x61, 0x70, + 0x65, 0x72, 0x3a, 0x20, 0x4e, 0x61, 0x74, 0x75, 0x72, 0x65, 0x20, 0x4d, + 
0x65, 0x74, 0x68, 0x6f, 0x64, 0x73, 0x20, 0x31, 0x32, 0x2c, 0x20, 0x31, + 0x30, 0x36, 0x31, 0xe2, 0x80, 0x93, 0x31, 0x30, 0x36, 0x33, 0x20, 0x28, + 0x32, 0x30, 0x31, 0x35, 0x29, 0x2c, 0x20, 0x68, 0x74, 0x74, 0x70, 0x73, + 0x3a, 0x2f, 0x2f, 0x77, 0x77, 0x77, 0x2e, 0x6e, 0x61, 0x74, 0x75, 0x72, + 0x65, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x61, 0x72, 0x74, 0x69, 0x63, 0x6c, + 0x65, 0x73, 0x2f, 0x6e, 0x6d, 0x65, 0x74, 0x68, 0x2e, 0x33, 0x35, 0x38, + 0x32, 0x20, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x53, 0x41, 0x4d, 0x74, + 0x61, 0x67, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, + 0x61, 0x64, 0x64, 0x20, 0x57, 0x41, 0x53, 0x50, 0x20, 0x74, 0x61, 0x67, + 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x61, 0x6c, 0x69, + 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74, + 0x20, 0x70, 0x61, 0x73, 0x73, 0x20, 0x57, 0x41, 0x53, 0x50, 0x20, 0x66, + 0x69, 0x6c, 0x74, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x0a, 0x0a, 0x23, 0x23, + 0x23, 0x20, 0x53, 0x54, 0x41, 0x52, 0x73, 0x6f, 0x6c, 0x6f, 0x20, 0x28, + 0x73, 0x69, 0x6e, 0x67, 0x6c, 0x65, 0x20, 0x63, 0x65, 0x6c, 0x6c, 0x20, + 0x52, 0x4e, 0x41, 0x2d, 0x73, 0x65, 0x71, 0x29, 0x20, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x0a, 0x73, 0x6f, 0x6c, 0x6f, + 0x54, 0x79, 0x70, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x4e, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, + 0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, 0x3a, 0x20, 0x74, 0x79, 0x70, 0x65, + 0x20, 0x6f, 0x66, 0x20, 0x73, 0x69, 0x6e, 0x67, 0x6c, 0x65, 0x2d, 0x63, + 0x65, 0x6c, 0x6c, 0x20, 0x52, 0x4e, 0x41, 0x2d, 0x73, 0x65, 0x71, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x44, 0x72, 0x6f, 0x70, 0x6c, 0x65, 0x74, 0x20, + 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x63, 0x65, + 0x6c, 0x6c, 0x20, 0x62, 0x61, 0x72, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x61, + 0x6e, 0x64, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x55, 0x4d, 0x49, 0x20, 0x62, + 0x61, 0x72, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x72, 0x65, + 0x61, 0x64, 0x32, 0x2c, 0x20, 0x65, 0x2e, 0x67, 0x2e, 0x20, 0x44, 0x72, + 0x6f, 0x70, 0x2d, 0x73, 0x65, 0x71, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x31, + 0x30, 0x58, 0x20, 0x43, 0x68, 0x72, 0x6f, 0x6d, 0x69, 0x75, 0x6d, 0x0a, + 0x0a, 0x73, 0x6f, 0x6c, 0x6f, 0x43, 0x42, 0x77, 0x68, 0x69, 0x74, 0x65, + 0x6c, 0x69, 0x73, 0x74, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, + 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x20, + 0x77, 0x69, 0x74, 0x68, 0x20, 0x77, 0x68, 0x69, 0x74, 0x65, 0x6c, 0x69, + 0x73, 0x74, 0x20, 0x6f, 0x66, 0x20, 0x63, 0x65, 0x6c, 0x6c, 0x20, 0x62, + 0x61, 0x72, 0x63, 0x6f, 0x64, 0x65, 0x73, 0x0a, 0x0a, 0x73, 0x6f, 0x6c, + 0x6f, 0x43, 0x42, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, + 0x3a, 0x20, 0x63, 0x65, 0x6c, 0x6c, 0x20, 0x62, 0x61, 0x72, 0x63, 0x6f, + 0x64, 0x65, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x62, 0x61, 0x73, + 0x65, 0x0a, 0x0a, 0x73, 0x6f, 0x6c, 0x6f, 0x43, 0x42, 0x6c, 0x65, 0x6e, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x36, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x63, 0x65, 0x6c, + 0x6c, 0x20, 0x62, 0x61, 0x72, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x6c, 0x65, + 0x6e, 0x67, 0x74, 0x68, 0x0a, 0x0a, 0x73, 0x6f, 0x6c, 0x6f, 0x55, 0x4d, + 0x49, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x37, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, + 0x55, 0x4d, 0x49, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x62, 0x61, + 0x73, 0x65, 0x0a, 0x0a, 0x73, 0x6f, 0x6c, 0x6f, 0x55, 0x4d, 0x49, 0x6c, + 0x65, 0x6e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x30, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3e, 0x30, 0x3a, 0x20, 0x55, 0x4d, + 0x49, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x0a, 0x0a, 0x73, 0x6f, + 0x6c, 0x6f, 0x42, 0x61, 0x72, 0x63, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x61, + 0x64, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x3a, + 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x6f, 0x66, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x62, 0x61, 0x72, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x72, + 0x65, 0x61, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x20, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x65, 0x71, 0x75, 0x61, 0x6c, 0x20, 0x74, 0x6f, + 0x20, 0x73, 0x75, 0x6d, 0x20, 0x6f, 0x66, 0x20, 0x73, 0x6f, 0x6c, 0x6f, + 0x43, 0x42, 0x6c, 0x65, 0x6e, 0x2b, 0x73, 0x6f, 0x6c, 0x6f, 0x55, 0x4d, + 0x49, 0x6c, 0x65, 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x64, 0x65, 0x66, 0x69, + 0x6e, 0x65, 0x64, 0x2c, 0x20, 0x64, 0x6f, 0x20, 0x6e, 0x6f, 0x74, 0x20, + 0x63, 0x68, 0x65, 0x63, 0x6b, 0x0a, 0x0a, 0x73, 0x6f, 0x6c, 0x6f, 0x53, + 0x74, 0x72, 0x61, 0x6e, 0x64, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x46, + 
0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, + 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x73, 0x74, 0x72, 0x61, 0x6e, + 0x64, 0x65, 0x64, 0x6e, 0x65, 0x73, 0x73, 0x20, 0x6f, 0x66, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x73, 0x6f, 0x6c, 0x6f, 0x20, 0x6c, 0x69, 0x62, 0x72, + 0x61, 0x72, 0x69, 0x65, 0x73, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x55, + 0x6e, 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x65, 0x64, 0x20, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x6e, 0x6f, 0x20, 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, + 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x46, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x72, 0x65, 0x61, + 0x64, 0x20, 0x73, 0x74, 0x72, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x61, 0x6d, + 0x65, 0x20, 0x61, 0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6f, 0x72, 0x69, + 0x67, 0x69, 0x6e, 0x61, 0x6c, 0x20, 0x52, 0x4e, 0x41, 0x20, 0x6d, 0x6f, + 0x6c, 0x65, 0x63, 0x75, 0x6c, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x52, + 0x65, 0x76, 0x65, 0x72, 0x73, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x2e, 0x2e, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x73, 0x74, 0x72, 0x61, + 0x6e, 0x64, 0x20, 0x6f, 0x70, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x65, 0x20, + 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6f, 0x72, 0x69, 0x67, 0x69, + 0x6e, 0x61, 0x6c, 0x20, 0x52, 0x4e, 0x41, 0x20, 0x6d, 0x6f, 0x6c, 0x65, + 0x63, 0x75, 0x6c, 0x65, 0x0a, 0x0a, 0x73, 0x6f, 0x6c, 0x6f, 0x46, 0x65, + 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x47, 0x65, + 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, + 0x67, 0x28, 0x73, 0x29, 0x3a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x6f, 0x6d, + 0x69, 0x63, 0x20, 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x20, + 0x66, 0x6f, 0x72, 0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x55, 0x4d, 0x49, 0x20, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x73, + 0x20, 0x70, 0x65, 0x72, 0x20, 0x43, 0x65, 0x6c, 0x6c, 0x20, 0x42, 0x61, + 0x72, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x61, 0x72, 0x65, 0x20, 0x63, 0x6f, + 0x6c, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x47, 0x65, 0x6e, 0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x67, 0x65, 0x6e, 0x65, + 0x73, 0x3a, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x20, 0x6d, 0x61, 0x74, + 0x63, 0x68, 0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x20, + 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x53, 0x4a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x73, + 0x70, 0x6c, 0x69, 0x63, 0x65, 0x20, 0x6a, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x73, 0x3a, 0x20, 0x72, 0x65, 0x70, 0x6f, 0x72, 0x74, 0x65, + 0x64, 0x20, 0x69, 0x6e, 0x20, 0x53, 0x4a, 0x2e, 0x6f, 0x75, 0x74, 0x2e, + 0x74, 0x61, 0x62, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x47, 0x65, 0x6e, 0x65, + 
0x46, 0x75, 0x6c, 0x6c, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x66, 0x75, 0x6c, 0x6c, 0x20, 0x67, 0x65, 0x6e, + 0x65, 0x73, 0x3a, 0x20, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x20, 0x61, 0x6c, + 0x6c, 0x20, 0x72, 0x65, 0x61, 0x64, 0x73, 0x20, 0x6f, 0x76, 0x65, 0x72, + 0x6c, 0x61, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x20, 0x67, 0x65, 0x6e, 0x65, + 0x73, 0x27, 0x20, 0x65, 0x78, 0x6f, 0x6e, 0x73, 0x20, 0x61, 0x6e, 0x64, + 0x20, 0x69, 0x6e, 0x74, 0x72, 0x6f, 0x6e, 0x73, 0x0a, 0x0a, 0x73, 0x6f, + 0x6c, 0x6f, 0x55, 0x4d, 0x49, 0x64, 0x65, 0x64, 0x75, 0x70, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x31, 0x4d, 0x4d, 0x5f, 0x41, 0x6c, 0x6c, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, 0x29, 0x3a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x55, 0x4d, + 0x49, 0x20, 0x64, 0x65, 0x64, 0x75, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x6c, 0x61, 0x70, 0x73, + 0x69, 0x6e, 0x67, 0x29, 0x20, 0x61, 0x6c, 0x67, 0x6f, 0x72, 0x69, 0x74, + 0x68, 0x6d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x4d, 0x4d, 0x5f, 0x41, + 0x6c, 0x6c, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x55, + 0x4d, 0x49, 0x73, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x31, 0x20, 0x6d, + 0x69, 0x73, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x64, 0x69, 0x73, 0x74, + 0x61, 0x6e, 0x63, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x65, 0x61, 0x63, 0x68, + 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72, 0x20, 0x61, 0x72, 0x65, 0x20, 0x63, + 0x6f, 0x6c, 0x6c, 0x61, 0x70, 0x73, 0x65, 0x64, 0x20, 0x28, 0x69, 0x2e, + 0x65, 0x2e, 0x20, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x65, 0x64, 0x20, 0x6f, + 
0x6e, 0x63, 0x65, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x4d, 0x4d, + 0x5f, 0x44, 0x69, 0x72, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x66, 0x6f, 0x6c, + 0x6c, 0x6f, 0x77, 0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x22, 0x64, 0x69, + 0x72, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x22, 0x20, 0x6d, + 0x65, 0x74, 0x68, 0x6f, 0x64, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x55, 0x4d, 0x49, 0x2d, 0x74, 0x6f, 0x6f, 0x6c, 0x73, + 0x20, 0x62, 0x79, 0x20, 0x53, 0x6d, 0x69, 0x74, 0x68, 0x2c, 0x20, 0x48, + 0x65, 0x67, 0x65, 0x72, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x53, 0x75, 0x64, + 0x62, 0x65, 0x72, 0x79, 0x20, 0x28, 0x47, 0x65, 0x6e, 0x6f, 0x6d, 0x65, + 0x20, 0x52, 0x65, 0x73, 0x65, 0x61, 0x72, 0x63, 0x68, 0x20, 0x32, 0x30, + 0x31, 0x37, 0x29, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x4d, 0x4d, + 0x5f, 0x4e, 0x6f, 0x74, 0x43, 0x6f, 0x6c, 0x6c, 0x61, 0x70, 0x73, 0x65, + 0x64, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x55, + 0x4d, 0x49, 0x73, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x31, 0x20, 0x6d, + 0x69, 0x73, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x64, 0x69, 0x73, 0x74, + 0x61, 0x6e, 0x63, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x6f, 0x74, 0x68, 0x65, + 0x72, 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x63, + 0x6f, 0x6c, 0x6c, 0x61, 0x70, 0x73, 0x65, 0x64, 0x20, 0x28, 0x69, 0x2e, + 0x65, 0x2e, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x63, 0x6f, 0x75, 0x6e, 0x74, + 0x65, 0x64, 0x29, 0x0a, 0x0a, 0x73, 0x6f, 0x6c, 0x6f, 0x4f, 0x75, 0x74, + 0x46, 0x69, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x73, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x53, 0x6f, 0x6c, + 
0x6f, 0x2e, 0x6f, 0x75, 0x74, 0x2f, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x73, + 0x2e, 0x74, 0x73, 0x76, 0x20, 0x62, 0x61, 0x72, 0x63, 0x6f, 0x64, 0x65, + 0x73, 0x2e, 0x74, 0x73, 0x76, 0x20, 0x6d, 0x61, 0x74, 0x72, 0x69, 0x78, + 0x2e, 0x6d, 0x74, 0x78, 0x20, 0x6d, 0x61, 0x74, 0x72, 0x69, 0x78, 0x53, + 0x4a, 0x2e, 0x6d, 0x74, 0x78, 0x20, 0x6d, 0x61, 0x74, 0x72, 0x69, 0x78, + 0x47, 0x65, 0x6e, 0x65, 0x46, 0x75, 0x6c, 0x6c, 0x2e, 0x6d, 0x74, 0x78, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, + 0x73, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x20, 0x6e, 0x61, + 0x6d, 0x65, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x53, 0x54, 0x41, 0x52, + 0x73, 0x6f, 0x6c, 0x6f, 0x20, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x31, 0x73, 0x74, 0x20, 0x77, 0x6f, 0x72, 0x64, + 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x66, 0x69, 0x6c, 0x65, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x6e, 0x64, 0x20, 0x77, 0x6f, 0x72, + 0x64, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x67, 0x65, 0x6e, + 0x65, 0x20, 0x49, 0x44, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x6e, 0x61, + 0x6d, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x72, 0x64, 0x20, + 0x77, 0x6f, 0x72, 0x64, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, + 0x62, 0x61, 0x72, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x73, 0x65, 0x71, 0x75, + 0x65, 0x6e, 0x63, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x34, 0x74, + 0x68, 0x20, 0x77, 0x6f, 0x72, 0x64, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, + 0x2e, 0x20, 0x63, 0x65, 0x6c, 0x6c, 0x2f, 0x47, 0x65, 0x6e, 0x65, 0x20, + 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x73, 0x20, 0x6d, 0x61, 0x74, 0x72, 0x69, + 0x78, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x35, 0x74, 0x68, 0x20, 0x77, 0x6f, + 0x72, 0x64, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x20, 0x63, 0x65, + 0x6c, 0x6c, 0x2f, 0x53, 0x4a, 0x20, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x73, + 0x20, 0x6d, 0x61, 0x74, 0x72, 0x69, 0x78, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x36, 0x74, 0x68, 0x20, 0x77, 0x6f, 0x72, 0x64, 0x20, 0x20, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x20, 0x63, 0x65, 0x6c, 0x6c, 0x2f, 0x47, 0x65, 0x6e, + 0x65, 0x46, 0x75, 0x6c, 0x6c, 0x20, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x73, + 0x20, 0x6d, 0x61, 0x74, 0x72, 0x69, 0x78, 0x0a, 0x0a +}; +unsigned int parametersDefault_len = 38385; diff --git a/star-sys/STAR/source/readLoad.cpp b/star-sys/STAR/source/readLoad.cpp new file mode 100644 index 0000000..122d483 --- /dev/null +++ b/star-sys/STAR/source/readLoad.cpp @@ -0,0 +1,169 @@ +#include "readLoad.h" +#include "ErrorWarning.h" + +int readLoad(istream& readInStream, const Parameters& P, uint iMate, uint& Lread, uint& LreadOriginal, char* readName, char* Seq, char* SeqNum, char* Qual, char* QualNum, uint &clip3pNtotal, uint &clip5pNtotal, uint &clip3pAdapterN, uint &/*iReadAll*/, uint &/*readFilesIndex*/, char &/*readFilter*/, string &readNameExtra){ + //load one read from a stream + int readFileType=0; + +// readInStream.getline(readName,DEF_readNameLengthMax); //extract name + + if 
(readInStream.peek()!='@' && readInStream.peek()!='>') return -1; //end of the stream + + readName[0]=0;//clear char array + readInStream >> readName; //TODO check that it does not overflow the array + if (strlen(readName)>=DEF_readNameLengthMax-1) { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR in reads input: read name is too long:" << readInStream.gcount()<<"\n"; + errOut << "Read Name="<logMain, EXIT_CODE_INPUT_FILES, P); + }; + + //printf("read name is: %s\n", readName); + + //readInStream >> iReadAll >> readFilter >> readFilesIndex; //extract read number + + getline(readInStream, readNameExtra); + if (!readNameExtra.empty()) { + size_t n1=readNameExtra.find_first_not_of(" \t"); + if (n1!=std::string::npos) { + readNameExtra=readNameExtra.substr(n1); + } else { + readNameExtra=""; + }; + }; + +// readInStream.ignore(DEF_readNameSeqLengthMax,'\n');//ignore the resit of the line - just in case + + readInStream.getline(Seq,DEF_readSeqLengthMax+1); //extract sequence + //printf("seq is: %s\n", Seq); + + Lread=0; + for (int ii=0; ii=32) { + Seq[Lread]=Seq[ii]; + ++Lread; + }; + }; + + if (Lread<1) { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR in reads input: short read sequence line: " << Lread <<"\n"; + errOut << "Read Name="<logMain, EXIT_CODE_INPUT_FILES, P); + }; + if (Lread>DEF_readSeqLengthMax) { + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR in reads input: Lread>=" << Lread << " while DEF_readSeqLengthMax=" << DEF_readSeqLengthMax <<"\n"; + errOut << "Read Name="<logMain, EXIT_CODE_INPUT_FILES, P); + }; + +// //was trying to read multi-line +// char nextChar='A'; +// Lread=0; +// while (nextChar!='@' && nextChar!='>' && nextChar!='+' && nextChar!=' ' && nextChar!='\n' && !readInStream.eof()) {//read multi-line fasta +// readInStream.getline(Seq+Lread,DEF_readSeqLengthMax+1); //extract sequence +// Lread+=(uint) readInStream.gcount() - 1; //count chars in the sequence line, but do not read 
yet +// nextChar=readInStream.peek(); +// }; +// if (Lread>DEF_readSeqLengthMax) { +// ostringstream errOut; +// errOut << "EXITING because of FATAL ERROR in reads input: Lread>=" << Lread << " while DEF_readSeqLengthMax=" << DEF_readSeqLengthMax <<"\n"; +// errOut << "Read Name="<logMain, EXIT_CODE_INPUT_FILES, P); +// }; +// LreadOriginal=Lread; + LreadOriginal=Lread; + if ( Lread>(P.clip5pNbases[iMate]+P.clip3pNbases[iMate]) ) { + Lread=Lread-(P.clip5pNbases[iMate]+P.clip3pNbases[iMate]); + } else { + Lread=0; + }; + convertNucleotidesToNumbersRemoveControls(Seq+P.clip5pNbases[iMate],SeqNum,Lread); + + //clip the adapter + if (P.clip3pAdapterSeq.at(iMate).length()>0) { + clip3pAdapterN = Lread-localSearch(SeqNum,Lread,P.clip3pAdapterSeqNum[iMate],P.clip3pAdapterSeq.at(iMate).length(),P.clip3pAdapterMMp[iMate]); + Lread = Lread>clip3pAdapterN ? Lread-clip3pAdapterN : 0; + } else { + clip3pAdapterN = 0; + }; + + //final read length, trim 3p after the adapter was clipped + if (Lread>P.clip3pAfterAdapterNbases[iMate]) { + Lread =Lread - P.clip3pAfterAdapterNbases[iMate]; + } else { + Lread=0; + }; + + clip3pNtotal=P.clip3pNbases[iMate] + clip3pAdapterN + P.clip3pAfterAdapterNbases[iMate]; + clip5pNtotal=P.clip5pNbases[iMate]; + + if (readName[0]=='@') {//fastq format, read qualities + readFileType=2; + readInStream.ignore(DEF_readNameLengthMax,'\n'); //extract header line + readInStream.getline(Qual,DEF_readSeqLengthMax);//read qualities + if ((uint) readInStream.gcount() != LreadOriginal+1) {//inconsistent read sequence and quality + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR in reads input: quality string length is not equal to sequence length\n"; + errOut << readName<<"\n"; + errOut << Seq <<"\n"; + errOut << Qual <<"\n"; + errOut << "SOLUTION: fix your fastq file\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + if (P.outQSconversionAdd!=0) { + for (uint ii=0;ii126) { + qs=126; + }; + 
Qual[ii]=qs; + }; + }; + + } else if (readName[0]=='>') {//fasta format, assign Qtop to all qualities + readFileType=1; + for (uint ii=0;ii : "<< readName<<"\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P); + }; + + for (uint ii=0;ii P.QasciiSubtract) ? (Qual[ii+P.clip5pNbases[iMate]] - P.QasciiSubtract) : 0; //substract QasciiSubtract +// QualNum[ii]=P.QSconv[(int) QualNum[ii]]; +// QualNum[ii]=min(QualNum[ii], P.Qtop);//cut QSs at the Qtop +// // if (QualNum[ii]==2) QualNum[ii]=P.Qtop; +// if (SeqNum[ii]>3) QualNum[ii]=0; //QS=0 for Ns +// Qual1[1][Lread-ii-1]=QualNum[ii]; //reverse +// }; + + + //trim read name + for (uint ii=0; ii + T sum1D(T* a, uint N) { + T s=0; + for (uint ii=0;ii int funCompareNumbers (const void *a, const void *b) { + numT va= *((numT*) a); + numT vb= *((numT*) b); + + if (va>vb) { + return 1; + } else if (va==vb) { + return 0; + } else { + return -1; + }; +}; + +template int funCompareNumbersReverse (const void *a, const void *b) { + numT va= *((numT*) a); + numT vb= *((numT*) b); + + if (va>vb) { + return -1; + } else if (va==vb) { + return 0; + } else { + return 1; + }; +}; + + +inline int funCompareUint1 (const void *a, const void *b) { + uint va= *((uint*) a); + uint vb= *((uint*) b); + + if (va>vb) { + return 1; + } else if (va==vb) { + return 0; + } else { + return -1; + }; +}; + +inline int funCompareUint2 (const void *a, const void *b) { + uint va= *((uint*) a); + uint vb= *((uint*) b); + uint va1=*(((uint*) a)+1); + uint vb1=*(((uint*) b)+1); + + if (va>vb) { + return 1; + } else if (va==vb && va1>vb1) { + return 1; + } else if (va==vb && va1==vb1) { + return 0; + } else { + return -1; + }; +}; + +template +inline int funCompareArrays (const void *a, const void *b) { + arrayType* va= (arrayType*) a; + arrayType* vb= (arrayType*) b; + + for (int ii=0;iivb[ii]) { + return 1; + } else if (va[ii] +inline int funCompareTypeShift (const void *a, const void *b) { + Type va= *( ((Type*) a)+Shift 
); + Type vb= *( ((Type*) b)+Shift ); + + if (va>vb) { + return 1; + } else if (va==vb) { + return 0; + } else { + return -1; + }; + +}; + +inline int splitString(const std::string &s, char delim, std::vector &elems) { + std::stringstream ss(s); + std::string item; + int maxL=0; + elems.clear(); + while (std::getline(ss, item, delim)) { + maxL=max(maxL, (int)item.size()); + elems.push_back(item); + }; + return maxL;//returns mzx string size +}; + +template +inline uint32 binarySearch1(argType x, argType *X, uint32 N) { + //binary search in the sorted list + //check the boundaries first + if (x>X[N-1] || xi1+1) {//binary search + i3=(i1+i2)/2; + if (X[i3]>x) { + i2=i3; + } else { + i1=i3; + }; + }; + + while (i1 +inline int32 binarySearch1a(argType x, argType *X, int32 N) { + //binary search in the sorted list + //check the boundaries first + + if (x>X[N-1]) { + return N-1; + } else if (xi1+1) {//binary search + i3=(i1+i2)/2; + if (X[i3]>x) { + i2=i3; + } else { + i1=i3; + }; + }; + + while (i1 +inline int32 binarySearch1b(argType x, argType *X, int32 N) +{ + //binary search in the sorted list + //check the boundaries first + //1b returns the first X element that is >= x + //X are all distinct + //if x>X[N-1], -1 is returned + + if (x>X[N-1]) { + return -1; + } else if (x<=X[0]) { + return 0; + }; + + int32 i1=0, i2=N-1, i3=N/2; + while (i2>i1+1) {//binary search + i3=(i1+i2)/2; + if (X[i3]>=x) { + i2=i3; + } else { + i1=i3; + }; + }; + + return i2; +}; + +template +inline int64 binarySearchExact(argType x, const argType *X, uint64 N) { + //binary search in the sorted list + //check the boundaries first + //returns -1 if no match found + //if X are not all distinct, no guarantee which element is returned + + if (x>X[N-1] || xi1+1) {//binary search + i3=(i1+i2)/2; + if (X[i3]>=x) { + i2=i3; + } else { + i1=i3; + }; + }; + + if (x==X[i2]) { + return i2; + } else if (x==X[i1]) { + return i1; + } else { + return -1; + }; +}; + + +#endif diff --git 
a/star-sys/STAR/source/signalFromBAM.cpp b/star-sys/STAR/source/signalFromBAM.cpp new file mode 100644 index 0000000..cdf9483 --- /dev/null +++ b/star-sys/STAR/source/signalFromBAM.cpp @@ -0,0 +1,209 @@ +#include "signalFromBAM.h" +#include +#include + +void signalFromBAM(const string bamFileName, const string sigFileName, Parameters P) { + + bam1_t *bamA; + bamA=bam_init1(); + + double nMult=0, nUniq=0; + + if (P.outWigFlags.norm==1) {//count reads in the BAM file + BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r"); + bam_hdr_t *bamHeader=bam_hdr_read(bamIn); + while ( true ) {//until the end of file + int bamBytes1=bam_read1(bamIn, bamA); + if (bamBytes1<0) break; //end of file + if (bamA->core.tid<0) continue; //unmapped read +// if ( !std::regex_match(chrName.at(bamA->core.tid),std::regex(P.outWigReferencesPrefix))) continue; //reference does not mathc required references + if ( P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) continue; //reference does not match required references + + uint8_t* aNHp=bam_aux_get(bamA,"NH"); + if (aNHp!=NULL) { + uint32_t aNH=bam_aux2i(aNHp); + if (aNH==1) {//unique mappers + ++nUniq; + } else if (aNH>1) { + nMult+=1.0/aNH; + }; + }; + }; + bgzf_close(bamIn); + }; + + BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r"); + bam_hdr_t *bamHeader=bam_hdr_read(bamIn); + + int sigN=P.outWigFlags.strand ? 
4 : 2; + + double *normFactor=new double[sigN]; + + ofstream **sigOutAll=new ofstream* [sigN]; + + string* sigOutFileName=new string[sigN]; + sigOutFileName[0]=sigFileName+".Unique.str1.out"; + sigOutFileName[1]=sigFileName+".UniqueMultiple.str1.out"; + if (P.outWigFlags.strand) { + sigOutFileName[2]=sigFileName+".Unique.str2.out"; + sigOutFileName[3]=sigFileName+".UniqueMultiple.str2.out"; + }; + + for (int ii=0; iicore.tid!=iChr || bamBytes1<0) { + //output to file + if (iChr!=-999) {//iChr=-999 marks chromosomes that are not output, including unmapped reads + for (int is=0;istarget_name[iChr] <<"\t"<< ig <<"\t"; //0-based beginning + }; + prevSig=newSig; + }; + } else if (P.outWigFlags.format==1){//wiggle + if (newSig!=0) { + *sigOutAll[is] <core.tid; + if ( iChr==-1 || (P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) ) { + iChr=-999; + continue; //reference does not match required references + }; + + chrLen=bamHeader->target_len[iChr]+1;//one extra base at the end which sohuld always be 0 + delete [] sigAll; + sigAll= new double[sigN*chrLen]; + memset(sigAll, 0, sizeof(*sigAll)*sigN*chrLen); + }; + +// uint32_t nCigar =(bamA->core.flag<<16)>>16; +// uint32_t mapFlag=bamA->core.flag>>16; +// uint32_t mapQ=(bamA->core.flag<<16)>>24; + + #define BAM_CIGAR_OperationShift 4 + #define BAM_CIGAR_LengthBits 28 + #define BAM_CIGAR_M 0 + #define BAM_CIGAR_I 1 + #define BAM_CIGAR_D 2 + #define BAM_CIGAR_N 3 + #define BAM_CIGAR_S 4 + #define BAM_CIGAR_H 5 + #define BAM_CIGAR_P 6 + #define BAM_CIGAR_EQ 7 + #define BAM_CIGAR_X 8 + + //by default, alignments marked as duplicate are not processed + if ( (bamA->core.flag & 0x400) > 0 ) continue; + + //NH attribute + uint8_t* aNHp=bam_aux_get(bamA,"NH"); + uint32_t aNH; + if (aNHp==NULL) { + aNH=1; //no NH tag: assume NH=1 + //continue; //do not process lines without NH field + } else { + 
aNH=bam_aux2i(bam_aux_get(bamA,"NH")); //write a safer function allowing for lacking NH tag + }; + if (aNH==0) continue; //do not process lines without NH=0 + uint32_t aG=bamA->core.pos; + uint32_t iStrand=0; + if (P.outWigFlags.strand) {//strand for stranded data from SAM flag + iStrand= ( (bamA->core.flag & 0x10) > 0 ) == ( (bamA->core.flag & 0x80) == 0 );//0/1 for +/- + }; + if (P.outWigFlags.type==1) {//5' of the1st read signal only, RAMPAGE/CAGE + if ( (bamA->core.flag & 0x80)>0) continue; //skip if this the second mate + if (iStrand==0) { + if (aNH==1) {//unique mappers + sigAll[aG*sigN+0+2*iStrand]++; + }; + sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci + continue; //record only the first position + }; + }; + + uint32_t* cigar=(uint32_t*) (bamA->data+bamA->core.l_qname); + + for (uint32_t ic=0; iccore.n_cigar; ic++) { + uint32_t cigOp=(cigar[ic]<>BAM_CIGAR_LengthBits; + uint32_t cigL=cigar[ic]>>BAM_CIGAR_OperationShift; + switch (cigOp) { + case(BAM_CIGAR_D): + case(BAM_CIGAR_N): + aG+=cigL; + break; + case(BAM_CIGAR_M): + if (P.outWigFlags.type==0 || (P.outWigFlags.type==2 && (bamA->core.flag & 0x80)>0 )) {//full signal, or second mate onyl signal + for (uint32_t ig=0;ig=chrLen) { + cerr << "BUG: alignment extends past chromosome in signalFromBAM.cpp\n"; + exit(-1); + }; + if (aNH==1) {//unique mappers + sigAll[aG*sigN+0+2*iStrand]++; + }; + sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci + aG++; + }; + } else { + aG+=cigL; + }; + }; + }; + if (P.outWigFlags.type==1) {//full signal + --aG; + if (aNH==1) {//unique mappers + sigAll[aG*sigN+0+2*iStrand]++; + }; + sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci + }; + }; + delete [] sigAll; + + for (int is=0; isflush(); + sigOutAll[is]->close(); + }; +}; diff --git a/star-sys/STAR/source/signalFromBAM.h b/star-sys/STAR/source/signalFromBAM.h new file mode 100644 index 
0000000..0406fdc --- /dev/null +++ b/star-sys/STAR/source/signalFromBAM.h @@ -0,0 +1,13 @@ +#ifndef CODE_signalFromBAM +#define CODE_signalFromBAM +#include "htslib/htslib/sam.h" +#include +#include +#include "Stats.h" +#include "Parameters.h" + +using namespace std; + +void signalFromBAM(const string bamFileName, const string sigFileName, Parameters P); + +#endif diff --git a/star-sys/STAR/source/sjAlignSplit.cpp b/star-sys/STAR/source/sjAlignSplit.cpp new file mode 100644 index 0000000..6ba0a2b --- /dev/null +++ b/star-sys/STAR/source/sjAlignSplit.cpp @@ -0,0 +1,16 @@ +#include "IncludeDefine.h" +#include "Genome.h" + +bool sjAlignSplit(uint a1,uint aLength, const Genome &mapGen, uint &a1D, uint &aLengthD, uint &a1A, uint &aLengthA, uint &isj) { + uint sj1=(a1-mapGen.sjGstart)%mapGen.sjdbLength; + if (sj1mapGen.sjdbOverhang) {//align crosses the junctions + isj=(a1-mapGen.sjGstart)/mapGen.sjdbLength; + aLengthD=mapGen.sjdbOverhang-sj1; + aLengthA=aLength-aLengthD; + a1D=mapGen.sjDstart[isj]+sj1; + a1A=mapGen.sjAstart[isj]; + return true; + } else { + return false; + }; +}; diff --git a/star-sys/STAR/source/sjdbBuildIndex.cpp b/star-sys/STAR/source/sjdbBuildIndex.cpp new file mode 100644 index 0000000..a986d61 --- /dev/null +++ b/star-sys/STAR/source/sjdbBuildIndex.cpp @@ -0,0 +1,331 @@ +#include "sjdbBuildIndex.h" +// #include "sjdbLoadFromStream.h" +// #include "sjdbPrepare.h" +#include "ErrorWarning.h" +#include "SuffixArrayFuns.h" +#include "SequenceFuns.h" +#include "serviceFuns.cpp" +#include "IncludeDefine.h" +#include "streamFuns.h" +#include "binarySearch2.h" +#include "ErrorWarning.h" +#include + +#include "funCompareUintAndSuffixes.h" + +void sjdbBuildIndex (Parameters &P, char *Gsj, char *G, PackedArray &SA, PackedArray &SA2, PackedArray &SAi, Genome &mapGen, Genome &mapGen1) { + + #define SPACER_CHAR GENOME_spacingChar + + if (mapGen.sjdbN==0) + {//no junctions to insert + return; + }; + + time_t rawtime; + time ( &rawtime ); + P.inOut->logMain << 
timeMonthDayTime(rawtime) << " ..... inserting junctions into the genome indices" <logStdOut << timeMonthDayTime(rawtime) << " ..... inserting junctions into the genome indices" < than any genome char + }; + Gsj[nGsj*2]=SPACER_CHAR;//mark the end of the text + + for (uint ii=0; ii=0 || seq1[0][istart]>3) + {//no index for already included junctions, or suffices starting with N + indArray[ind1]=-1; + } else + { + //indArray[ind1] = suffixArraySearch(seq1, istart, mapGen.sjdbLength-istart1, G, SA, true, 0, mapGen.nSA-1, 0, P) ; + indArray[ind1] = suffixArraySearch1(mapGen, seq1, istart, 10000, -1LLU, true, 0, mapGen.nSA-1, 0) ; + //-1LLU results in suffixes for the new junctions to be always included in SA *after* the suffixes of the old junctions + //for identical suffixes, this may result in unstable ordering + indArray[ind1+1] = isj*mapGen.sjdbLength+istart; + }; + }; + }; + sjNew = sjNew/2;//novel junctions were double counted on two strands + + time ( &rawtime ); + P.inOut->logMain << timeMonthDayTime(rawtime) << " Finished SA search: number of new junctions=" << sjNew <<", old junctions="<logMain << timeMonthDayTime(rawtime) << " Finished sorting SA indicesL nInd="<logMain <<"Genome size with junctions="<logMain <<"GstrandBit1="< mapGen.GstrandBit) + {//too many junctions were added - GstrandBit changed + ostringstream errOut; + errOut << "EXITING because of FATAL ERROR: cannot insert junctions on the fly because of strand GstrandBit problem\n"; + errOut << "SOLUTION: please contact STAR author at https://groups.google.com/forum/#!forum/rna-star\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P); + }; + + SA2.defineBits(mapGen.GstrandBit+1,mapGen.nSA); + uint nGsjNew=sjNew*mapGen.sjdbLength; //this is the actual number of bytes added to the genome, while nGsj is the total size of all junctions + + uint N2bit= 1LLU << mapGen.GstrandBit; + uint strandMask=~N2bit; + + /*testing + PackedArray SAo; + 
SAo.defineBits(mapGen.GstrandBit+1,mapGen.nSA); + SAo.allocateArray(); + ifstream oldSAin("./DirTrue/SA"); + oldSAin.read(SAo.charArray,SAo.lengthByte); + oldSAin.close(); + */ + + uint isj=0, isa2=0; + for (uint isa=0;isa0 ) + {//- strand + uint ind1s = mapGen1.nGenome - (ind1 & strandMask); + if (ind1s>mapGen.chrStart[mapGen.nChrReal]) + {//this index was an old sj, may need to shift it + uint sj1 = (ind1s-mapGen.chrStart[mapGen.nChrReal])/mapGen.sjdbLength;//old junction index + ind1s += (oldSJind[sj1]-sj1)*mapGen.sjdbLength; + ind1 = (mapGen.nGenome - ind1s) | N2bit; + } else + { + ind1+=nGsjNew; //reverse complementary indices are all shifted by the length of junctions + }; + } else + {//+ strand + if (ind1>mapGen.chrStart[mapGen.nChrReal]) + {//this index was an old sj, may need to shift it + uint sj1 = (ind1-mapGen.chrStart[mapGen.nChrReal])/mapGen.sjdbLength;//old junction index + ind1 += (oldSJind[sj1]-sj1)*mapGen.sjdbLength; + }; + }; + + SA2.writePacked(isa2,ind1); + /*testing + if (SA2[isa2]!=SAo[isa2]) { + cout <logMain << timeMonthDayTime(rawtime) << " Finished inserting junction indices" <0 ) + {//index missing from the old genome + uint iSJ1=iSJ; + int64 ind1=funCalcSAi(Gsj+indArray[2*iSJ+1],iL); + while (ind1 < (int64)(ii-mapGen.genomeSAindexStart[iL]) && indArray[2*iSJ]-1= (int64) (ii-mapGen.genomeSAindexStart[iL]) ) {//this belongs to the next index + break; + }; + ++iSJ; + }; + + SAi.writePacked(ii,iSA1+iSJ); + + for (uint ii0=ind0+1; ii03) {//this iSA contains N, need to mark the previous + for (uint iL1=iL; iL1 < mapGen.pGe.gSAindexNbases; iL1++) { + ind1+=3; + int64 ind2=mapGen.genomeSAindexStart[iL1]+ind1; + for (; ind2>=0; ind2--) {//find previous index that is not absent + if ( (SAi[ind2] & mapGen.SAiMarkAbsentMaskC)==0 ) { + break; + }; + }; + SAi.writePacked(ind2,SAi[ind2] | mapGen.SAiMarkNmaskC); + ind1 <<= 2; + }; + break; + } else { + ind1 += g; + }; + }; + }; + time ( &rawtime ); + P.inOut->logMain << timeMonthDayTime(rawtime) << " 
Finished SAi" <0 && sjdbLoci.chr.size()==0) + {//load from the saved genome, only if the loading did not happen already (if sjdb insertion happens at the 1st pass, sjdbLoci will be populated + ifstream & sjdbStreamIn = ifstrOpen(P.pGe.gDir+"/sjdbList.out.tab", ERROR_OUT, "SOLUTION: re-generate the genome in pGe.gDir=" + P.pGe.gDir, P); + sjdbLoadFromStream(sjdbStreamIn, sjdbLoci); + sjdbLoci.priority.resize(sjdbLoci.chr.size(),30); + time ( &rawtime ); + P.inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the generated genome " << P.pGe.gDir+"/sjdbList.out.tab" <<": "<logMain, EXIT_CODE_INPUT_FILES, P); + }; + sjdbLoadFromStream(sjdbStreamIn, sjdbLoci); + sjdbLoci.priority.resize(sjdbLoci.chr.size(),0); + time ( &rawtime ); + P.inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the 1st pass file: " << P.twoPass.pass1sjFile <<": "<logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the pGe.sjdbFileChrStartEnd file(s), " << sjdbLoci.chr.size()<<" total junctions\n\n"; + }; + + if (P.pGe.sjdbGTFfile!="-") + {//load from GTF + loadGTF(sjdbLoci, P, P.sjdbInsert.outDir, mapGen); + sjdbLoci.priority.resize(sjdbLoci.chr.size(),20); + time ( &rawtime ); + P.inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the GTF file: " << P.pGe.sjdbGTFfile<<": "<logMain << timeMonthDayTime(rawtime) << " Finished preparing junctions" <P.limitSjdbInsertNsj) + { + ostringstream errOut; + errOut << "Fatal LIMIT error: the number of junctions to be inserted on the fly ="<logMain, EXIT_CODE_INPUT_FILES, P); + }; + //insert junctions into the genome and SA and SAi + sjdbBuildIndex (P, Gsj, mapGen.G, mapGen.SA, (P.twoPass.pass2 ? mapGen.SApass2 : mapGen.SApass1), mapGen.SAi, mapGen, mapGen1); + delete [] Gsj; //junction sequences have been added to G + time ( &rawtime ); + P.inOut->logMain << timeMonthDayTime(rawtime) << " ..... 
finished inserting junctions into genome" <logMain, EXIT_CODE_INPUT_FILES, P); + }; + + sjdbLoadFromStream(sjdbStreamIn, sjdbLoci); + + P.inOut->logMain << "Loaded database junctions from file: " << P.pGe.sjdbFileChrStartEnd.at(ifile) <<", total number of junctions: "< +#include "SjdbClass.h" +#include "Parameters.h" + +void sjdbLoadFromFiles(Parameters &P, SjdbClass &sjdbLoci); + +#endif diff --git a/star-sys/STAR/source/sjdbLoadFromStream.cpp b/star-sys/STAR/source/sjdbLoadFromStream.cpp new file mode 100644 index 0000000..2421631 --- /dev/null +++ b/star-sys/STAR/source/sjdbLoadFromStream.cpp @@ -0,0 +1,29 @@ +#include "sjdbLoadFromStream.h" +void sjdbLoadFromStream(ifstream &sjdbStreamIn, SjdbClass &sjdbLoci) { + while (sjdbStreamIn.good()) { + string oneLine,chr1; + uint u1,u2; + char str1; + getline(sjdbStreamIn,oneLine); + istringstream oneLineStream (oneLine); + oneLineStream >> chr1 >> u1 >> u2 >> str1; + if (chr1!="") { + sjdbLoci.chr.push_back(chr1); + sjdbLoci.start.push_back(u1); + sjdbLoci.end.push_back(u2); + switch (str1) {//convert numbers to symbols + case '1': + case '+': + str1='+'; + break; + case '2': + case '-': + str1='-'; + break; + default: + str1='.'; + }; + sjdbLoci.str.push_back(str1); + }; + }; +}; \ No newline at end of file diff --git a/star-sys/STAR/source/sjdbLoadFromStream.h b/star-sys/STAR/source/sjdbLoadFromStream.h new file mode 100644 index 0000000..a92dfff --- /dev/null +++ b/star-sys/STAR/source/sjdbLoadFromStream.h @@ -0,0 +1,8 @@ +#ifndef CODE_sjdbLoadFromStream +#define CODE_sjdbLoadFromStream + +#include +#include "SjdbClass.h" +void sjdbLoadFromStream(ifstream &sjdbStreamIn, SjdbClass &sjdbLoci); + +#endif diff --git a/star-sys/STAR/source/sjdbPrepare.cpp b/star-sys/STAR/source/sjdbPrepare.cpp new file mode 100644 index 0000000..47b0e68 --- /dev/null +++ b/star-sys/STAR/source/sjdbPrepare.cpp @@ -0,0 +1,225 @@ +#include "sjdbPrepare.h" +#include "ErrorWarning.h" +#include "serviceFuns.cpp" + +void sjdbPrepare (SjdbClass 
&sjdbLoci, Parameters &P, uint nGenomeReal, string outDir, Genome &mapGen, char *Gsj) { + + char *G=mapGen.G; + + uint *sjdbS=new uint [sjdbLoci.chr.size()]; + uint *sjdbE=new uint [sjdbLoci.chr.size()]; + + uint8 *sjdbMotif=new uint8 [sjdbLoci.chr.size()]; + uint8 *sjdbShiftLeft=new uint8 [sjdbLoci.chr.size()]; + uint8 *sjdbShiftRight=new uint8 [sjdbLoci.chr.size()]; + + + string chrOld=""; + uint iChr=0; + for (uint ii=0;ii=mapGen.nChrReal) { + ostringstream errOut; + errOut << "EXITING because of FATAL error, the sjdb chromosome " << sjdbLoci.chr.at(ii) << " is not found among the genomic chromosomes\n"; + errOut << "SOLUTION: fix your file(s) --sjdbFileChrStartEnd or --sjdbGTFfile, offending junction:" <logMain, EXIT_CODE_INPUT_FILES, P); + }; + chrOld=sjdbLoci.chr.at(ii); + }; + + sjdbS[ii] = sjdbLoci.start.at(ii) + mapGen.chrStart[iChr] - 1;//sj names contain 1-based intron loci + sjdbE[ii] = sjdbLoci.end.at(ii) + mapGen.chrStart[iChr] - 1; + + //motifs + if ( G[sjdbS[ii]]==2 && G[sjdbS[ii]+1]==3 && G[sjdbE[ii]-1]==0 && G[sjdbE[ii]]==2 ) {//GTAG + sjdbMotif[ii]=1; + } else if ( G[sjdbS[ii]]==1 && G[sjdbS[ii]+1]==3 && G[sjdbE[ii]-1]==0 && G[sjdbE[ii]]==1 ) {//CTAC + sjdbMotif[ii]=2; + } else if ( G[sjdbS[ii]]==2 && G[sjdbS[ii]+1]==1 && G[sjdbE[ii]-1]==0 && G[sjdbE[ii]]==2 ) {//GCAG + sjdbMotif[ii]=3; + } else if ( G[sjdbS[ii]]==1 && G[sjdbS[ii]+1]==3 && G[sjdbE[ii]-1]==2 && G[sjdbE[ii]]==1 ) {//CTGC + sjdbMotif[ii]=4; + } else if ( G[sjdbS[ii]]==0 && G[sjdbS[ii]+1]==3 && G[sjdbE[ii]-1]==0 && G[sjdbE[ii]]==1 ) {//ATAC + sjdbMotif[ii]=5; + } else if ( G[sjdbS[ii]]==2 && G[sjdbS[ii]+1]==3 && G[sjdbE[ii]-1]==0 && G[sjdbE[ii]]==3 ) {//GTAT + sjdbMotif[ii]=6; + } else { + sjdbMotif[ii]=0; + }; + //repeat length: go back and forth around jR to find repeat length + uint jjL=0,jjR=0; + while ( jjL <= sjdbS[ii]-1 && G[sjdbS[ii]-1-jjL]==G[sjdbE[ii]-jjL] && G[sjdbS[ii]-1-jjL]<4 && jjL<255) {//go back + jjL++; + }; + sjdbShiftLeft[ii]=jjL; + + while ( sjdbS[ii]+jjR < 
nGenomeReal && G[sjdbS[ii]+jjR]==G[sjdbE[ii]+1+jjR] && G[sjdbS[ii]+jjR]<4 && jjR<255) {//go forward + jjR++; + }; + sjdbShiftRight[ii]=jjR; + + + if (jjR==255 || jjL==255) { + P.inOut->logMain << "WARNING: long repeat for junction # " << ii+1 <<" : " \ + << sjdbLoci.chr.at(ii) <<" "<0) + { + isj0=I[nsj-1]; //index of the last recorded junctions + }; + + if (nsj==0 || sjdbS[isj]!=sjdbS[isj0] || sjdbE[isj]!=sjdbE[isj0]) + {//different intron coordinates + I[nsj++]=isj;// add new junction + } else if (sjdbLoci.priority.at(isj)sjdbLoci.priority.at(isj0)) + {//new junction has higher priority + I[nsj-1]=isj;//replace the old junction + } else if ( (sjdbMotif[isj]>0 && sjdbMotif[isj0]==0) \ + || ( ((sjdbMotif[isj]>0) == (sjdbMotif[isj0]>0)) && sjdbShiftLeft[isj]0 && mapGen.sjdbStart[nsj1-1]==sjdbSort[ii*3] && mapGen.sjdbEnd[nsj1-1]==sjdbSort[ii*3+1] ) {//same loci on opposite strands + uint isj0=sjdbSort[(ii-1)*3+2]; + + if (sjdbLoci.priority.at(isj)sjdbLoci.priority.at(isj0)) + {//new junction has higher priority + nsj1--;//replace the old junction with the new one + } else if (mapGen.sjdbStrand[nsj1-1]>0 && sjdbLoci.str.at(isj)=='.') + {//new junction strand is not defined + continue; + } else if (mapGen.sjdbStrand[nsj1-1]==0 && sjdbLoci.str.at(isj)!='.') + {//old junction strand is not defined + nsj1--; //replace old with new + } else if (mapGen.sjdbMotif[nsj1-1]==0 && sjdbMotif[isj]==0) + {//both are non-canonical (on opposite strand) + mapGen.sjdbStrand[nsj1-1]=0;//do not record new junction, keep old with undefined strand + continue; + } else if ( (mapGen.sjdbMotif[nsj1-1]>0 && sjdbMotif[isj]==0) ||(mapGen.sjdbMotif[nsj1-1]%2 == (2-mapGen.sjdbStrand[nsj1-1])) ){//both strands defined, both junctions canonical + //old junction is canonical, new is not, OR old junction is on correct strand + continue; + } else { + //new junction is on correct strand, replace the old one + nsj1--; + }; + }; + + //record junction + mapGen.sjdbStart[nsj1]=sjdbSort[ii*3]; + 
mapGen.sjdbEnd[nsj1]=sjdbSort[ii*3+1]; + mapGen.sjdbMotif[nsj1]=sjdbMotif[isj]; + mapGen.sjdbShiftLeft[nsj1]=sjdbShiftLeft[isj]; + mapGen.sjdbShiftRight[nsj1]=sjdbShiftRight[isj]; + if (sjdbLoci.str.at(isj)=='+') { + mapGen.sjdbStrand[nsj1]=1; + } else if (sjdbLoci.str.at(isj)=='-') { + mapGen.sjdbStrand[nsj1]=2; + } else { + if (mapGen.sjdbMotif[nsj1]==0) {//strand un-defined + mapGen.sjdbStrand[nsj1]=0; + } else { + mapGen.sjdbStrand[nsj1]=2-mapGen.sjdbMotif[nsj1]%2; + }; + }; + nsj1++; + }; + mapGen.sjdbN=nsj1; + mapGen.sjDstart = new uint [mapGen.sjdbN]; + mapGen.sjAstart = new uint [mapGen.sjdbN]; + + ofstream sjdbInfo((outDir+"/sjdbInfo.txt").c_str()); + ofstream sjdbList ((outDir+"/sjdbList.out.tab").c_str()); + char strandChar[3]={'.','+','-'}; + //first line is some general useful information + sjdbInfo << mapGen.sjdbN <<"\t"<< mapGen.sjdbOverhang <<"\n"; + uint sjGstart=0; + + for (uint ii=0;ii> P.pGe.gChrBinNbits]; + sjdbList << mapGen.chrName[chr1]<< "\t" << mapGen.sjdbStart[ii]-mapGen.chrStart[chr1] + 1 + (mapGen.sjdbMotif[ii]>0 ? 0:mapGen.sjdbShiftLeft[ii]) \ + << "\t"<< mapGen.sjdbEnd[ii]-mapGen.chrStart[chr1] + 1 + (mapGen.sjdbMotif[ii]>0 ? 
0:mapGen.sjdbShiftLeft[ii]) \ + << "\t"<< strandChar[mapGen.sjdbStrand[ii]]<<"\n"; + }; + sjdbInfo.close(); + sjdbList.close(); + +}; + diff --git a/star-sys/STAR/source/sjdbPrepare.h b/star-sys/STAR/source/sjdbPrepare.h new file mode 100644 index 0000000..51d8030 --- /dev/null +++ b/star-sys/STAR/source/sjdbPrepare.h @@ -0,0 +1,10 @@ +#ifndef CODE_sjdbPrepare +#define CODE_sjdbPrepare + +#include "SjdbClass.h" +#include "Parameters.h" +#include "Genome.h" + +void sjdbPrepare (SjdbClass &sjdbLoci, Parameters &P, uint nGenomeReal, string outDir, Genome &mapGen, char *Gsj); + +#endif \ No newline at end of file diff --git a/star-sys/STAR/source/sortSuffixesBucket.h b/star-sys/STAR/source/sortSuffixesBucket.h new file mode 100644 index 0000000..bfbc326 --- /dev/null +++ b/star-sys/STAR/source/sortSuffixesBucket.h @@ -0,0 +1,3 @@ +#include + +void sortSuffixesBucket(char *G, void *ind, int indN, int indSkip); \ No newline at end of file diff --git a/star-sys/STAR/source/stitchAlignToTranscript.cpp b/star-sys/STAR/source/stitchAlignToTranscript.cpp new file mode 100644 index 0000000..ae7da94 --- /dev/null +++ b/star-sys/STAR/source/stitchAlignToTranscript.cpp @@ -0,0 +1,410 @@ +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "extendAlign.h" +#include "binarySearch2.h" +// #include "stitchGapIndel.cpp" + + +intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBstart, uint L, uint iFragB, uint sjAB, const Parameters& P, char* R, const Genome &mapGen, Transcript *trA, const uint outFilterMismatchNmaxTotal) { + //stitch together A and B, extend in the gap, returns max score + + if (trA->nExons>=MAX_N_EXONS) + return -1000010; + + char *G=mapGen.G; + int Score=0; + + if (sjAB!=((uint) -1) && trA->exons[trA->nExons-1][EX_sjA]==sjAB \ + && trA->exons[trA->nExons-1][EX_iFrag]==iFragB && rBstart==rAend+1 && gAend+1exons[trA->nExons-1][EX_L]<=mapGen.sjdbShiftLeft[sjAB]) ) { + return -1000006; //too large repeats 
around non-canonical junction + }; + trA->exons[trA->nExons][EX_L] = L; //new exon length + trA->exons[trA->nExons][EX_R] = rBstart; //new exon r-start + trA->exons[trA->nExons][EX_G] = gBstart; //new exon g-start + trA->canonSJ[trA->nExons-1]=mapGen.sjdbMotif[sjAB]; //mark sj-db + trA->shiftSJ[trA->nExons-1][0]=mapGen.sjdbShiftLeft[sjAB]; + trA->shiftSJ[trA->nExons-1][1]=mapGen.sjdbShiftRight[sjAB]; + trA->sjAnnot[trA->nExons-1]=1; + trA->sjStr[trA->nExons-1]=mapGen.sjdbStrand[sjAB];; + trA->nExons++; + trA->nMatch+=L; + for (uint ii=rBstart;iisjAnnot[trA->nExons-1]=0; + trA->sjStr[trA->nExons-1]=0; + + if (trA->exons[trA->nExons-1][EX_iFrag]==iFragB) {//stitch aligns on the same fragment + uint gBend=gBstart+L-1; + uint rBend=rBstart+L-1; + +// {//debug +// if (sjAB!=((uint) -1) && trA->exons[trA->nExons-1][EX_sjA]!=((uint) -1) && rBend<=rAend) {// +// Score -= rAend-rBstart+1; +// gAend -= rAend-rBstart+1; +// rAend = rBstart-1; +// trA->exons[trA->nExons-1][EX_L] =rAend-trA->exons[trA->nExons-1][EX_R]+1; +// }; +// }; + + //check if r-overlapping fully and exit + if (rBend<=rAend) return -1000001; + if (gBend<=gAend && trA->exons[trA->nExons-1][EX_iFrag]==iFragB) return -1000002; + + //shift the B 5' if overlaps A 3' + if (rBstart<=rAend) { + gBstart+=rAend-rBstart+1; + rBstart=rAend+1; + L=rBend-rBstart+1; + }; + + for (uint ii=rBstart;ii<=rBend;ii++) Score+=scoreMatch; //add QS for mapped portions + + int gGap=gBstart-gAend-1; //could be < 0 for insertions + int rGap=rBstart-rAend-1;//>0 always since we removed overlap + + uint nMatch=L; + uint nMM=0; + uint Del=0, Ins=0; + uint nIns=0, nDel=0; + int jR=0; //junction location in R-space + int jCan=999; //canonical junction type + uint gBstart1=gBstart-rGap-1;//the last base of the intron if all read gap belongs to acceptor, i.e. 
jR=0 + + + // check all the different combinations of gGap and rGap + if ( gGap==0 && rGap==0 ) {//just joined the pieces, w/o stiching or gaps + //do nothing for now + } else if ( gGap>0 && rGap>0 && rGap==gGap ) {//no gaps, just try to fill space + //simple stitching, assuming no insertion in the read + + for (int ii=1;ii<=rGap;ii++) { + if (G[gAend+ii]<4 && R[rAend+ii]<4) {//only score genome bases that are not Ns + if ( R[rAend+ii]==G[gAend+ii] ) { + Score+=scoreMatch; + nMatch++; + } else { + Score-=scoreMatch; + nMM++; + }; + }; + }; + + } else if ( gGap>rGap ) {//genomic gap (Deletion) + + nDel=1; + Del=gGap-rGap; //gGap>0 here + + if (Del>P.alignIntronMax && P.alignIntronMax>0) { + return -1000003; //large gaps not allowed + }; + + int Score1=0; + int jR1=1; //junction location in R-space + do { // 1. move left, until the score for MM is less than canonical advantage + jR1--; + if ( R[rAend+jR1]!=G[gBstart1+jR1] && G[gBstart1+jR1]<4 && R[rAend+jR1]==G[gAend+jR1]) Score1 -= scoreMatch; + } while ( Score1+P.scoreStitchSJshift >= 0 && int(trA->exons[trA->nExons-1][EX_L]) + jR1 > 1);//>=P.alignSJoverhangMin); //also check that we are still within the exon + + int maxScore2=-999999; + Score1=0; + int jPen=0; + do { // 2. scan to the right to find the best junction locus + // ?TODO? if genome base is N, how to score? 
+ if ( R[rAend+jR1]==G[gAend+jR1] && R[rAend+jR1]!=G[gBstart1+jR1] ) Score1+=scoreMatch; + if ( R[rAend+jR1]!=G[gAend+jR1] && R[rAend+jR1]==G[gBstart1+jR1] ) Score1-=scoreMatch; + + int jCan1=-1; //this marks Deletion + int jPen1=0; + int Score2=Score1; + + if (Del>=P.alignIntronMin) {//only check intron motif for large gaps= non-Dels + //check if the intron is canonical, or semi-canonical + if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==2 ) {//GTAG + jCan1=1; + } else if ( G[gAend+jR1+1]==1 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==1 ) {//CTAC + jCan1=2; + } else if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==1 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==2 ) {//GCAG + jCan1=3; + jPen1=P.scoreGapGCAG; + } else if ( G[gAend+jR1+1]==1 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==2 && G[gBstart1+jR1]==1 ) {//CTGC + jCan1=4; + jPen1=P.scoreGapGCAG; + } else if ( G[gAend+jR1+1]==0 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==1 ) {//ATAC + jCan1=5; + jPen1=P.scoreGapATAC; + } else if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==3 ) {//GTAT + jCan1=6; + jPen1=P.scoreGapATAC; + } else { + jCan1=0; + jPen1=P.scoreGapNoncan; + }; + + Score2 += jPen1; + }; + + if (maxScore2 < Score2 ) {//check if the score is the highest. 
TODO: record the next highest score + maxScore2=Score2; + jR=jR1; //this is the last base of donor + jCan=jCan1; + jPen=jPen1; + }; + jR1++; + } while ( jR1 < int(rBend) - int(rAend) );// - int(P.alignSJoverhangMin) );//TODO: do not need to search the full B-transcript, can stop as soon as Score goes down by more than + + //repeat length: go back and forth around jR to find repeat length + uint jjL=0,jjR=0; + while ( gAend+jR>=jjL && G[gAend-jjL+jR]==G[gBstart1-jjL+jR] && G[gAend-jjL+jR]<4 && jjL<=MAX_SJ_REPEAT_SEARCH) {//go back + jjL++; + }; + + while ( gAend+jjR+jR+1exons[trA->nExons-1][EX_L])+jR<1) return -1000005; + jjR+=jjL; + jjL=0; + }; + + //TODO check here if the internal exon length < minDa, if so exit w/o stitiching + + for (int ii=min(1,jR+1);ii<=max(rGap,jR);ii++) {//score donor and acceptor + uint g1=(ii<=jR) ? (gAend+ii):(gBstart1+ii); + if (G[g1]<4 && R[rAend+ii]<4) {//only penalize non-N bases + if ( R[rAend+ii]==G[g1] ) { + if (ii>=1 && ii <=rGap) {//only add +score and matches within the gap + Score+=scoreMatch; + nMatch++; + }; + } else {//add -score and MM for all bases + Score-=scoreMatch; + nMM++; + if (ii<1 || ii>rGap) {//subtract previuosly presumed matches + Score-=scoreMatch; + nMatch--; +// if (ii<=jR) nMM--; + }; + }; + }; + }; + + //score the gap + if (mapGen.sjdbN>0) {//check if the junction is annotated + uint jS=gAend+jR+1, jE=gBstart1+jR;//intron start/end + int sjdbInd=binarySearch2(jS,jE,mapGen.sjdbStart,mapGen.sjdbEnd,mapGen.sjdbN); + if (sjdbInd<0) { + if (Del>=P.alignIntronMin) { + Score += P.scoreGap + jPen; //genome gap penalty + non-canonical penalty + } else {//deletion + Score += Del*P.scoreDelBase + P.scoreDelOpen; + jCan=-1; + trA->sjAnnot[trA->nExons-1]=0; +// jjR-=jjL; +// jR-=jjL; +// jjL=0; +// trA->shiftSJ[trA->nExons-1][0]=0; +// trA->shiftSJ[trA->nExons-1][1]=jjR; + }; + } else {//annotated + jCan=mapGen.sjdbMotif[sjdbInd]; + if (mapGen.sjdbMotif[sjdbInd]==0) {//shift to match annotations + if 
(L<=mapGen.sjdbShiftLeft[sjdbInd] || trA->exons[trA->nExons-1][EX_L]<=mapGen.sjdbShiftLeft[sjdbInd]) { + return -1000006; + }; + jR += (int) mapGen.sjdbShiftLeft[sjdbInd]; + jjL=mapGen.sjdbShiftLeft[sjdbInd]; + jjR=mapGen.sjdbShiftRight[sjdbInd]; + }; + trA->sjAnnot[trA->nExons-1]=1; + trA->sjStr[trA->nExons-1]=mapGen.sjdbStrand[sjdbInd]; + Score += P.pGe.sjdbScore; + }; + } else {//no annotation + if (Del>=P.alignIntronMin) {//junction, not short deletion + Score += P.scoreGap + jPen; + } else { + Score += Del*P.scoreDelBase + P.scoreDelOpen; + jCan=-1; + trA->sjAnnot[trA->nExons-1]=0; + }; + }; + + trA->shiftSJ[trA->nExons-1][0]=jjL; + trA->shiftSJ[trA->nExons-1][1]=jjR; + trA->canonSJ[trA->nExons-1]=jCan; + + if (trA->sjAnnot[trA->nExons-1]==0) {//strand for unannotated junctions + if (jCan>0) { + trA->sjStr[trA->nExons-1]=2-jCan%2; //1=+,2=- + } else { + trA->sjStr[trA->nExons-1]=0; + }; + }; + + } else if ( rGap>gGap ) {//insertion: if also gGap>0, need to stitch + Ins=rGap-gGap; + nIns=1; + if (gGap==0) {//simple insertion, no need to stitch + jR=0; + } else if (gGap<0) {//overlapping seeds: reduce the score + jR=0; + for (int ii=0; ii<-gGap; ii++) { + Score -= scoreMatch; + }; + } else {//stitch: define the exon boundary jR + int Score1=0; int maxScore1=0; + for (int jR1=1;jR1<=gGap;jR1++) {//scan to the right to find the best score + + if (G[gAend+jR1]<4) {//only penalize goog genome bases + Score1+=( R[rAend+jR1]==G[gAend+jR1] ) ? scoreMatch:-scoreMatch; + Score1+=( R[rAend+Ins+jR1]==G[gAend+jR1] ) ? -scoreMatch:+scoreMatch; + }; + + if (Score1>maxScore1 || (Score1==maxScore1 && P.alignInsertionFlush.flushRight)) {//equal sign (>=) flushes insertions to the right + maxScore1=Score1; + jR=jR1; + }; + }; + for (int ii=1;ii<=gGap;ii++) {//score donor and acceptor + uint r1=rAend+ii+(ii<=jR ? 
0:Ins); + if (G[gAend+ii]<4 && R[r1]<4) { + if ( R[r1]==G[gAend+ii] ) { + Score+=scoreMatch; + nMatch++; + } else {//add -score and MM for all bases + Score-=scoreMatch; + nMM++; + }; + }; + }; + }; + + if (P.alignInsertionFlush.flushRight) { + for (; jR<(int)rBend-(int)rAend-(int)Ins; jR++ ){//flush the indel to the right as much as possible + if (R[rAend+jR+1]!=G[gAend+jR+1] || G[gAend+jR+1]==4) { + break; + }; + }; + if (jR==(int)rBend-(int)rAend-(int)Ins) {//nothing left of the B-piece + return -1000009; + }; + }; + Score += Ins*P.scoreInsBase + P.scoreInsOpen; + jCan=-2; //marks insertion though it's not used below + }; //different types of gaps selection + + + + #ifdef COMPILE_FOR_LONG_READS + if ( (trA->nMM + nMM)<=outFilterMismatchNmaxTotal ) +// if ( Score>0 && nMM<=200 ) + + #else + if ( (trA->nMM + nMM)<=outFilterMismatchNmaxTotal \ + && ( jCan<0 || (jCan<7 && nMM<= (uint) P.alignSJstitchMismatchNmax[(jCan+1)/2]) ) ) + #endif + {//stitching worked only if there no mis-matches for non-GT/AG junctions + trA->nMM += nMM; + trA->nMatch += nMatch; + + if (Del>=P.alignIntronMin) { + trA->nGap += nDel; + trA->lGap += Del; + } else { + trA->nDel += nDel; + trA->lDel += Del; + }; + + //modify exons + if (Del==0 && Ins==0) {//no gap => no new exon, extend the boundary of the previous exon + trA->exons[trA->nExons-1][EX_L] += rBend-rAend; + } else if (Del>0) { //deletion:ca only have Del> or Ins>0 + trA->exons[trA->nExons-1][EX_L] += jR; //correct the previous exon boundary + trA->exons[trA->nExons][EX_L] = rBend-rAend-jR; //new exon length + trA->exons[trA->nExons][EX_R] = rAend+jR+1; //new exon r-start + trA->exons[trA->nExons][EX_G] = gBstart1+jR+1; //new exon g-start + trA->nExons++; + } else if (Ins>0) { //Ins>0; + trA->nIns += nIns; + trA->lIns += Ins; + trA->exons[trA->nExons-1][EX_L] += jR; //correct the previous exon boundary + trA->exons[trA->nExons][EX_L] = rBend-rAend-jR-Ins; //new exon length + trA->exons[trA->nExons][EX_R] = rAend+jR+Ins+1; //new exon 
r-start + trA->exons[trA->nExons][EX_G] = gAend+1+jR; //new exon g-start + trA->canonSJ[trA->nExons-1]=-2; //mark insertion + trA->sjAnnot[trA->nExons-1]=0; + trA->nExons++; + }; + } else {//stitching was not accepted + return -1000007; + }; + } else if (gBstart+trA->exons[0][EX_R]+P.alignEndsProtrude.nBasesMax >= trA->exons[0][EX_G] || trA->exons[0][EX_G] < trA->exons[0][EX_R]){//if (iFragA==iFragB) stitch aligns from different fragments + //CHECK: this second confdition does not make sense + if (P.alignMatesGapMax>0 && gBstart > trA->exons[trA->nExons-1][EX_G] + trA->exons[trA->nExons-1][EX_L] + P.alignMatesGapMax) { + return -1000004; //gap between mates too large + }; + //extend the fragments inside + //note, that this always works, i.e. Score>0 + + for (uint ii=rBstart;ii1 + //TTCTGTGTCTCCCCCTCCCCCACTGGCTACATGGAGACAGGGGGGGGGGGCCGGGCGGTTCCCGGGCAGAAAAAAA + //>1 + //AATATTTGGAACACTTATGTGAAAAATGATTTGTTTTTCTGAAATTTACGTTTCTCTCTGAGTCCTGTAACTGTCC + + + trExtend.reset(); + if ( extendAlign(R, G, rAend+1, gAend+1, 1, 1, DEF_readSeqLengthMax, trA->nMatch, trA->nMM, outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax, \ + P.alignEndsType.ext[trA->exons[trA->nExons-1][EX_iFrag]][1], &trExtend) ) { + + trA->add(&trExtend); + Score += trExtend.maxScore; + + trA->exons[trA->nExons-1][EX_L] += trExtend.extendL; + };// if extendAlign for read A + + trA->exons[trA->nExons][EX_R] = rBstart; + trA->exons[trA->nExons][EX_G] = gBstart; + trA->exons[trA->nExons][EX_L] = L; + trA->nMatch += L; + + trExtend.reset(); + //if end extension needs to be forced, use large length. Otherwise, only extend until the beginning of the transcript + uint extlen=P.alignEndsType.ext[iFragB][1] ? 
DEF_readSeqLengthMax : gBstart-trA->exons[0][EX_G]+trA->exons[0][EX_R]; + if ( extendAlign(R, G, rBstart-1, gBstart-1, -1, -1, extlen, trA->nMatch, trA->nMM, outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax, \ + P.alignEndsType.ext[iFragB][1], &trExtend) ) { + + trA->add(&trExtend); + Score += trExtend.maxScore; + + trA->exons[trA->nExons][EX_R] -= trExtend.extendL; + trA->exons[trA->nExons][EX_G] -= trExtend.extendL; + trA->exons[trA->nExons][EX_L] += trExtend.extendL; + }; //if extendAlign B + + trA->canonSJ[trA->nExons-1]=-3; //mark different fragments junction + trA->sjAnnot[trA->nExons-1]=0; + + trA->nExons++; + } else {//no stitching possible + return -1000008; + }; + }; + + trA->exons[trA->nExons-1][EX_iFrag]=iFragB; //the new exon belongs to fragment iFragB + trA->exons[trA->nExons-1][EX_sjA]=sjAB; + + return Score; +}; diff --git a/star-sys/STAR/source/stitchAlignToTranscript.h b/star-sys/STAR/source/stitchAlignToTranscript.h new file mode 100644 index 0000000..2ccd935 --- /dev/null +++ b/star-sys/STAR/source/stitchAlignToTranscript.h @@ -0,0 +1,7 @@ +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "Genome.h" + +intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBstart, uint L, uint iFragB, uint sjAB, const Parameters& P, char* R, const Genome &mapGen, Transcript *trA, uint outFilterMismatchNmaxTotal); + diff --git a/star-sys/STAR/source/stitchGapIndel.cpp b/star-sys/STAR/source/stitchGapIndel.cpp new file mode 100644 index 0000000..ed93314 --- /dev/null +++ b/star-sys/STAR/source/stitchGapIndel.cpp @@ -0,0 +1,59 @@ +#include "IncludeDefine.h" +#include "Parameters.h" + +int stitchGapIndel (uint rAend, uint gAend, uint rBstart, uint gBstart, uint L, uint gapStart, uint gapEnd, char* R, char* G, Parameters& P,\ + uint &iRbest, uint &nMM){//returns stitch score + + uint gapLength = gapEnd-gapStart+1; + sint inDel= (sint) (gBstart-gAend-1) - (sint) gapLength - (sint) (rBstart-rAend-1); 
//>0: deletion; <0: insertion + + if (inDel==0) {//this should not happen, it should have been caught in the first stitching + return -1; + }; + int score2best; + int score2; + + if (inDel>0) {// + score2=0; + score2best=-1; + iRbest=0; + for (uint iR=1; iR=gapStart) iG1 += gapLength;//exclude gap + if (iG2>=gapStart) iG2 += gapLength; + + if ( R[rAend+iR]==G[iG1] && R[rAend+iR]!=G[iG2] ) { + score2++; + } else if ( R[rAend+iR]!=G[iG1] && R[rAend+iR]==G[iG2] ) { + score2--; + }; + + if (score2>score2best) { + score2best=score2; + iRbest=iR; + }; + }; + + //score the alignment with inDel at iRbest + nMM=0; + score2= L - inDel*P.scoreDelBase - P.scoreDelOpen; //score B and deletion + for (uint iR=1; iRiRbest) iG += (uint) inDel; + if (iG>=gapStart) iG += gapLength;//exclude gap + + if ( R[rAend+iR]==G[iG] ) { + score2++; + } else if (R[rAend+iR]!=G[iG] && R[rAend+iR]<4 && G[iG]<4) {//only penalize mismatches for non-N bases + score2--; + nMM++; + }; + }; + + } else { + return -1; + }; + + return score2; +}; diff --git a/star-sys/STAR/source/stitchWindowAligns.cpp b/star-sys/STAR/source/stitchWindowAligns.cpp new file mode 100644 index 0000000..df024dc --- /dev/null +++ b/star-sys/STAR/source/stitchWindowAligns.cpp @@ -0,0 +1,353 @@ +#include "stitchWindowAligns.h" +#include "blocksOverlap.h" +#include "ErrorWarning.h" +#include "binarySearch2.h" +#include +#include + +void stitchWindowAligns(uint iA, uint nA, int Score, bool WAincl[], uint tR2, uint tG2, Transcript trA, \ + uint Lread, uiWA* WA, char* R, const Genome &mapGen, \ + const Parameters& P, Transcript** wTr, uint* nWinTr, ReadAlign *RA) { + //recursively stitch aligns for one gene + //*nWinTr - number of transcripts for the current window + + if (iA>=nA && tR2==0) return; //no aligns in the transcript + + if (iA>=nA) {//no more aligns to add, finalize the transcript + + //extend first + Transcript trAstep1; + + int vOrder[2]; //decide in which order to extend: extend the 5' of the read first + + #if 
EXTEND_ORDER==1 + if ( trA.roStr==0 ) {//decide in which order to extend: extend the 5' of the read first + vOrder[0]=0; vOrder[1]=1; + } else { + vOrder[0]=1; vOrder[1]=0; + }; + #elif EXTEND_ORDER==2 + vOrder[0]=0; vOrder[1]=1; + #else + #error "EXTEND_ORDER value unrecognized" + #endif + + for (int iOrd=0;iOrd<2;iOrd++) { + + switch (vOrder[iOrd]) { + + case 0: //extend at start + + if (trA.rStart>0) {// if transcript does not start at base, extend to the read start + trAstep1.reset(); + uint imate=trA.exons[0][EX_iFrag]; + if ( extendAlign(R, mapGen.G, trA.rStart-1, trA.gStart-1, -1, -1, trA.rStart, tR2-trA.rStart+1, \ + trA.nMM, RA->outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax, \ + P.alignEndsType.ext[imate][(int)(trA.Str!=imate)], &trAstep1) ) {//if could extend + + trA.add(&trAstep1); + Score += trAstep1.maxScore; + + trA.exons[0][EX_R] = trA.rStart = trA.rStart - trAstep1.extendL; + trA.exons[0][EX_G] = trA.gStart = trA.gStart - trAstep1.extendL; + trA.exons[0][EX_L] += trAstep1.extendL; + + }; + //TODO penalize the unmapped bases at the start + }; + break; + + case 1: //extend at end + + if ( tR2outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax, \ + P.alignEndsType.ext[imate][(int)(imate==trA.Str)], &trAstep1) ) {//if could extend + + trA.add(&trAstep1); + Score += trAstep1.maxScore; + + tR2 += trAstep1.extendL; + tG2 += trAstep1.extendL; + + trA.exons[trA.nExons-1][EX_L] += trAstep1.extendL;//extend the length of the last exon + + }; + //TODO penalize unmapped bases at the end + }; + }; + }; + + if (!P.alignSoftClipAtReferenceEnds.yes && \ + ( (trA.exons[trA.nExons-1][EX_G] + Lread-trA.exons[trA.nExons-1][EX_R]) > (mapGen.chrStart[trA.Chr]+mapGen.chrLength[trA.Chr]) || \ + trA.exons[0][EX_G]<(mapGen.chrStart[trA.Chr]+trA.exons[0][EX_R]) ) ) { + return; //no soft clipping past the ends of the chromosome + }; + + + trA.rLength = 0; + for (uint isj=0;isj=0 ) {//junction + if (trA.sjAnnot[isj]==1) {//sjdb + if ( ( trA.exons[isj][EX_L] < 
P.alignSJDBoverhangMin && (isj==0 || trA.canonSJ[isj-1]==-3 || (trA.sjAnnot[isj-1]==0 && trA.canonSJ[isj-1]>=0) ) )\ + || ( trA.exons[isj+1][EX_L] < P.alignSJDBoverhangMin && (isj==trA.nExons-2 || trA.canonSJ[isj+1]==-3 || (trA.sjAnnot[isj+1]==0 && trA.canonSJ[isj+1]>=0) ) ) )return; + } else {//non-sjdb + if ( trA.exons[isj][EX_L] < P.alignSJoverhangMin + trA.shiftSJ[isj][0] \ + || trA.exons[isj+1][EX_L] < P.alignSJoverhangMin + trA.shiftSJ[isj][1] ) return; + }; + }; + }; + if (trA.nExons>1 && trA.sjAnnot[trA.nExons-2]==1 && trA.exons[trA.nExons-1][EX_L] < P.alignSJDBoverhangMin) return; //this exon was not checkedin the cycle above + + //filter strand consistency + uint sjN=0; + trA.intronMotifs[0]=0;trA.intronMotifs[1]=0;trA.intronMotifs[2]=0; + for (uint iex=0;iex=0) + {//junctions - others are indels + sjN++; + trA.intronMotifs[trA.sjStr[iex]]++; + }; + }; + + if (trA.intronMotifs[1]>0 && trA.intronMotifs[2]==0) + trA.sjMotifStrand=1; + else if (trA.intronMotifs[1]==0 && trA.intronMotifs[2]>0) + trA.sjMotifStrand=2; + else + trA.sjMotifStrand=0; + + if (trA.intronMotifs[1]>0 && trA.intronMotifs[2]>0 && P.outFilterIntronStrands=="RemoveInconsistentStrands") + return; + + if (sjN>0 && trA.sjMotifStrand==0 && P.outSAMstrandField.type==1) {//strand not defined for a junction + return; + }; + + if (P.outFilterIntronMotifs=="None") {//no filtering + + } else if (P.outFilterIntronMotifs=="RemoveNoncanonical") { + for (uint iex=0;iexlogMain, EXIT_CODE_INPUT_FILES, P); + }; + + {//check mapped length for each mate + uint nsj=0,exl=0; + for (uint iex=0;iex0 && (exlreadLength[trA.exons[iex][EX_iFrag]])) ) { + return; //do not record this transcript + }; + exl=0;nsj=0; + } else if (trA.canonSJ[iex]>=0) { + nsj++; + }; + }; + }; + + if (P.outFilterBySJoutStage==2) {//junctions have to be present in the filtered set P.sjnovel + for (uint iex=0;iex=0 && trA.sjAnnot[iex]==0) { + uint jS=trA.exons[iex][EX_G]+trA.exons[iex][EX_L]; + uint jE=trA.exons[iex+1][EX_G]-1; + if ( 
binarySearch2(jS,jE,P.sjNovelStart,P.sjNovelEnd,P.sjNovelN) < 0 ) return; + }; + }; + }; + + if ( trA.exons[0][EX_iFrag]!=trA.exons[trA.nExons-1][EX_iFrag] ) {//check for correct overlap between mates + if (trA.exons[trA.nExons-1][EX_G]+trA.exons[trA.nExons-1][EX_L] <= trA.exons[0][EX_G]) return; //to avoid negative insert size + uint iexM2=trA.nExons; + for (uint iex=0;iex trA.exons[iexM2][EX_G] ) {//mates overlap - check consistency of junctions + + if (trA.exons[0][EX_G] > \ + trA.exons[iexM2][EX_G]+trA.exons[0][EX_R]+P.alignEndsProtrude.nBasesMax) return; //LeftMateStart > RightMateStart + allowance + if (trA.exons[iexM2-1][EX_G]+trA.exons[iexM2-1][EX_L] > \ + trA.exons[trA.nExons-1][EX_G]+Lread-trA.exons[trA.nExons-1][EX_R]+P.alignEndsProtrude.nBasesMax) return; //LeftMateEnd > RightMateEnd +allowance + + //check for junctions consistency + uint iex1=1, iex2=iexM2+1; //last exons of the junction + for (; iex1= trA.exons[iex2-1][EX_G] + trA.exons[iex2-1][EX_L]) break; + }; + while (iex1maxScoreMate[trA.iFrag] = max (RA->maxScoreMate[trA.iFrag] , Score); + } else { + trA.iFrag=-1; + }; + + //Variation + //Score+=trA.variationAdjust(mapGen, R); + + trA.maxScore=Score; + + // transcript has been finalized, compare the score and record + if ( Score+P.outFilterMultimapScoreRange >= wTr[0]->maxScore \ + || ( trA.iFrag>=0 && Score+P.outFilterMultimapScoreRange >= RA->maxScoreMate[trA.iFrag] ) \ + || P.pCh.segmentMin>0) { + //only record the transcripts within the window that are in the Score range + //OR within the score range of each mate + //OR all transcript if chimeric detection is activated + +// if (P.alignEndsType.in=="EndToEnd") {//check that the alignment is end-to-end +// uint rTotal=trA.rLength+trA.lIns; +// // for (uint iex=1;iexreadLength[0]+RA->readLength[1])) || (trA.iFrag>=0 && rTotalreadLength[trA.iFrag])) return; +// }; + + uint iTr=0; //transcript insertion/replacement place + + trA.mappedLength=0; + for (uint iex=0;iexmappedLength-nOverlap; + + if 
(uNew==0 && Score < wTr[iTr]->maxScore) {//new transript is a subset of the old ones + break; + } else if (uOld==0) {//old transcript is a subset of the new one, remove old transcript + Transcript *pTr=wTr[iTr]; + for (uint ii=iTr+1;ii<*nWinTr;ii++) wTr[ii-1]=wTr[ii]; //shift transcripts + (*nWinTr)--; + wTr[*nWinTr]=pTr; + } else if (uOld>0 && (uNew>0 || Score >= wTr[iTr]->maxScore) ) {//check next transcript + iTr++; + }; + + }; + + if (iTr==*nWinTr) {//insert the new transcript + for (iTr=0;iTr<*nWinTr;iTr++) {//find inseriton location + if (Score>wTr[iTr]->maxScore || (Score==wTr[iTr]->maxScore && trA.gLengthgLength) ) break; + }; + + Transcript *pTr=wTr[*nWinTr]; + for (int ii=*nWinTr; ii> int(iTr); ii--) {//shift all the transcript pointers down from iTr + wTr[ii]=wTr[ii-1]; + }; + wTr[iTr]=pTr; //the new transcript pointer is now at *nWinTr+1, move it into the iTr + *(wTr[iTr])=trA; + if (*nWinTr0) {//stitch, a transcript has already been originated + + dScore=stitchAlignToTranscript(tR2, tG2, WA[iA][WA_rStart], WA[iA][WA_gStart], WA[iA][WA_Length], WA[iA][WA_iFrag], WA[iA][WA_sjA], P, R, mapGen, &trAi, RA->outFilterMismatchNmaxTotal); + //TODO check if the new stitching creates too many MM, quit this transcript if so + + } else { //this is the first align in the transcript + trAi.exons[0][EX_R]=trAi.rStart=WA[iA][WA_rStart]; //transcript start/end + trAi.exons[0][EX_G]=trAi.gStart=WA[iA][WA_gStart]; + trAi.exons[0][EX_L]=WA[iA][WA_Length]; + trAi.exons[0][EX_iFrag]=WA[iA][WA_iFrag]; + trAi.exons[0][EX_sjA]=WA[iA][WA_sjA]; + + trAi.nExons=1; //recorded first exon + + for (uint ii=0;ii-1000000) {//include this align + WAincl[iA]=true; + + if ( WA[iA][WA_Nrep]==1 ) trAi.nUnique++; //unique piece + if ( WA[iA][WA_Anchor]>0 ) trAi.nAnchor++; //anchor piece piece + + stitchWindowAligns(iA+1, nA, Score+dScore, WAincl, WA[iA][WA_rStart]+WA[iA][WA_Length]-1, WA[iA][WA_gStart]+WA[iA][WA_Length]-1, trAi, Lread, WA, R, mapGen, P, wTr, nWinTr, RA); + } else { + + }; + + 
//also run a transcript w/o including this align + if (WA[iA][WA_Anchor]!=2 || trA.nAnchor>0) {//only allow exclusion if this is not the last anchor, or other anchors have been used + WAincl[iA]=false; + stitchWindowAligns(iA+1, nA, Score, WAincl, tR2, tG2, trA, Lread, WA, R, mapGen, P, wTr, nWinTr, RA); + }; + return; +}; + + diff --git a/star-sys/STAR/source/stitchWindowAligns.h b/star-sys/STAR/source/stitchWindowAligns.h new file mode 100644 index 0000000..fe7af81 --- /dev/null +++ b/star-sys/STAR/source/stitchWindowAligns.h @@ -0,0 +1,12 @@ +#include "IncludeDefine.h" +#include "Parameters.h" +#include "Transcript.h" +#include "extendAlign.h" +#include "stitchAlignToTranscript.h" +#include "ReadAlign.h" + +void stitchWindowAligns(uint iA, uint nA, int Score, bool WAincl[], uint tR2, uint tG2, Transcript trA, \ + uint Lread, uiWA* WA, char* R, const Genome &mapGen, \ + const Parameters& P, Transcript** wTr, uint* nWinTr, ReadAlign *RA); + //recursively stitch aligns for one gene + //*nWinTr - number of transcripts for the current window diff --git a/star-sys/STAR/source/streamFuns.cpp b/star-sys/STAR/source/streamFuns.cpp new file mode 100644 index 0000000..b970cc8 --- /dev/null +++ b/star-sys/STAR/source/streamFuns.cpp @@ -0,0 +1,108 @@ +#include "streamFuns.h" +#include "ErrorWarning.h" +#include +#include +#include +#include +#define fstream_Chunk_Max 2147483647 + +unsigned long long fstreamReadBig(std::ifstream &S, char* A, unsigned long long N) { + unsigned long long C=0; + for (unsigned long long ii=0; iilogMain << "Writing " << N << " bytes into " < "+ P.pGe.gDir +"/error.info 2>&1").c_str()); +// ifstream error_info((P.pGe.gDir +"/error.info").c_str()); +// P.inOut->logMain <logMain, EXIT_CODE_FILE_WRITE, P); + }; + P.inOut->logMain << " done\n" <logMain, EXIT_CODE_FILE_OPEN, P); + }; + return ofStream; +}; + +std::fstream &fstrOpen (std::string fileName, std::string errorID, const Parameters &P) {//open file 'fileName', generate error if cannot open + 
std::fstream &fStream = *new std::fstream(fileName.c_str(), std::fstream::in | std::fstream::out | std::fstream::trunc); + if (fStream.fail()) {// + ostringstream errOut; + errOut << errorID<<": exiting because of *OUTPUT FILE* error: could not create input/output file "<< fileName <<"\n"; + errOut << "Solution: check that the path exists and you have write permission for this file\n"; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_FILE_OPEN, P); + }; + return fStream; +}; + +std::ifstream & ifstrOpen (std::string fileName, std::string errorID, std::string solutionString, const Parameters &P) { + //open file 'fileName', generate error if cannot open + std::ifstream & ifStream = *new std::ifstream(fileName.c_str()); + if (ifStream.fail()) {// + ostringstream errOut; + errOut << errorID<<": exiting because of *INPUT FILE* error: could not open input file "<< fileName <<"\n"; + errOut << "Solution: check that the file exists and you have read permission for this file\n"; + if (solutionString.size()>0) { + errOut << " "<< solutionString <<"\n"; + }; + exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_FILE_OPEN, P); + }; + return ifStream; +}; + +ifstream & ifstrOpenGenomeFile (std::string fileName, std::string errorID, const Parameters &P) { + //open one of the genome files + return ifstrOpen(P.pGe.gDir+"/"+fileName, errorID, "if this file is missing from the genome directory, you will need to *re-generate the genome*", P); +}; + +void copyFile(string fileIn, string fileOut) +{//copy fileIn into FileOut + std::ifstream src(fileIn, std::ios::binary); + std::ofstream dst(fileOut, std::ios::binary); + dst << src.rdbuf(); +}; diff --git a/star-sys/STAR/source/streamFuns.h b/star-sys/STAR/source/streamFuns.h new file mode 100644 index 0000000..947cadc --- /dev/null +++ b/star-sys/STAR/source/streamFuns.h @@ -0,0 +1,16 @@ +#ifndef CODE_streamFuns +#define CODE_streamFuns + +#include "Parameters.h" +#include + +unsigned long long 
fstreamReadBig(std::ifstream &S, char* A, unsigned long long N); +void fstreamWriteBig(std::ofstream &S, char* A, unsigned long long N, std::string fileName, std::string errorID, const Parameters &P) ; + +fstream &fstrOpen (std::string fileName, std::string errorID, const Parameters &P); +ofstream &ofstrOpen (std::string fileName, std::string errorID, const Parameters &P); +ifstream &ifstrOpen (std::string fileName, std::string errorID, std::string solutionString, const Parameters &P); +ifstream &ifstrOpenGenomeFile (std::string fileName, std::string errorID, const Parameters &P); + +void copyFile(string fileIn, string fileOut); +#endif diff --git a/star-sys/STAR/source/stringSubstituteAll.cpp b/star-sys/STAR/source/stringSubstituteAll.cpp new file mode 100644 index 0000000..f14fb09 --- /dev/null +++ b/star-sys/STAR/source/stringSubstituteAll.cpp @@ -0,0 +1,10 @@ +#include "stringSubstituteAll.h" + +void stringSubstituteAll(std::string& str, const std::string& from, const std::string& to) { + if(from.empty()) return; + size_t start_pos = 0; + while((start_pos = str.find(from, start_pos)) != std::string::npos) { + str.replace(start_pos, from.length(), to); + start_pos += to.length(); // In case 'to' contains 'from', like replacing 'x' with 'yx' + }; +}; diff --git a/star-sys/STAR/source/stringSubstituteAll.h b/star-sys/STAR/source/stringSubstituteAll.h new file mode 100644 index 0000000..bf86023 --- /dev/null +++ b/star-sys/STAR/source/stringSubstituteAll.h @@ -0,0 +1,8 @@ +#ifndef DEF_stringReplaceAll +#define DEF_stringReplaceAll + +#include + +void stringSubstituteAll(std::string& str, const std::string& from, const std::string& to); + +#endif diff --git a/star-sys/STAR/source/sysRemoveDir.cpp b/star-sys/STAR/source/sysRemoveDir.cpp new file mode 100644 index 0000000..39cb471 --- /dev/null +++ b/star-sys/STAR/source/sysRemoveDir.cpp @@ -0,0 +1,28 @@ +#include +#include +//#define _XOPEN_SOURCE 500 +#include +#include + +int removeFileOrDir(const char *fpath,const 
struct stat *sb, int typeflag, struct FTW *ftwbuf) { + + {//to avoid unused variable warning + (void) sb; + (void) ftwbuf; + }; + + if (typeflag==FTW_F) {//file + remove(fpath); + } else if (typeflag==FTW_DP) {//dir + rmdir(fpath); + } else {//something went wrong, stop the removal + return -1; + }; + return 0; +}; + + +void sysRemoveDir(std::string dirName) {//remove directory and all its contents + int nftwFlag=FTW_DEPTH; + nftw(dirName.c_str(), removeFileOrDir, 100, nftwFlag); +}; diff --git a/star-sys/STAR/source/sysRemoveDir.h b/star-sys/STAR/source/sysRemoveDir.h new file mode 100644 index 0000000..d254b4e --- /dev/null +++ b/star-sys/STAR/source/sysRemoveDir.h @@ -0,0 +1,8 @@ +#ifndef DEF_sysRemoveDir +#define DEF_sysRemoveDir + +#include + +void sysRemoveDir(std::string dirName); + +#endif \ No newline at end of file