|
gnFASSource.cppGo to the documentation of this file.00001 00002 // File: gnFASSource.h 00003 // Purpose: Implements gnBaseSource for .FAS files 00004 // Description: 00005 // Changes: 00006 // Version: libGenome 0.1.0 00007 // Author: Aaron Darling 00008 // Last Edited: April 15, 2001, 11:13:00pm 00009 // Modified by: 00010 // Copyright: (c) Aaron Darling 00011 // Licenses: Proprietary 00013 #include "gn/gnFASSource.h" 00014 #include "gn/gnBaseSpec.h" 00015 #include "gn/gnFilter.h" 00016 #include "gn/gnStringTools.h" 00017 #include "gn/gnSourceSpec.h" 00018 #include "gn/gnSourceHeader.h" 00019 #include "gn/gnDebug.h" 00020 00021 gnFASSource::gnFASSource() 00022 { 00023 m_openString = ""; 00024 m_pFilter = gnFilter::fullDNASeqFilter(); 00025 if(m_pFilter == NULL){ 00026 DebugMsg("Error using static sequence filters."); 00027 } 00028 } 00029 gnFASSource::gnFASSource( const gnFASSource& s ) : gnFileSource(s) 00030 { 00031 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin(); 00032 for( ; iter != s.m_contigList.end(); ++iter ) 00033 { 00034 m_contigList.push_back( (*iter)->Clone() ); 00035 } 00036 } 00037 gnFASSource::~gnFASSource() 00038 { 00039 m_ifstream.close(); 00040 vector< gnFileContig* >::iterator iter = m_contigList.begin(); 00041 for( ; iter != m_contigList.end(); ++iter ) 00042 { 00043 gnFileContig* fg = *iter; 00044 *iter = 0; 00045 delete fg; 00046 } 00047 } 00048 boolean gnFASSource::HasContig( const string& nameStr ) const 00049 { 00050 for(uint32 i = 0 ; i <= m_contigList.size(); i++ ) 00051 { 00052 if( nameStr == m_contigList[i]->GetName() ) 00053 return true; 00054 } 00055 return false; 00056 } 00057 uint32 gnFASSource::GetContigID( const string& name ) const 00058 { 00059 for(uint32 i = 0 ; i <= m_contigList.size(); i++ ) 00060 { 00061 if( name == m_contigList[i]->GetName() ) 00062 return i; 00063 } 00064 return ALL_CONTIGS; 00065 } 00066 string gnFASSource::GetContigName( const uint32 i ) const 00067 { 00068 if( i < m_contigList.size() ){ 00069 return m_contigList[i]->GetName(); 00070 } 00071 return ""; 00072 } 00073 gnSeqI gnFASSource::GetContigSeqLength( const uint32 i ) const 00074 { 00075 if( i < m_contigList.size() ){ 00076 return m_contigList[i]->GetSeqLength(); 00077 }else if( i == ALL_CONTIGS){ 00078 gnSeqI seqlen = 0; 00079 for(uint32 j=0; j < m_contigList.size(); j++) 00080 seqlen += m_contigList[j]->GetSeqLength(); 00081 return seqlen; 00082 } 00083 return GNSEQI_ERROR; 00084 } 00085 gnFileContig* gnFASSource::GetContig( const uint32 i ) const 00086 { 00087 if( i < m_contigList.size() ){ 00088 return m_contigList[i]; 00089 } 00090 return 0; 00091 } 00092 00093 boolean gnFASSource::SeqRead( const gnSeqI start, char* buf, uint32& bufLen, const uint32 contigI ) 00094 { 00095 m_ifstream.clear(); 00096 uint32 contigIndex = contigI; 00097 uint64 startPos = 0; 00098 uint64 readableBytes = 0; 00099 if( !SeqSeek( start, contigIndex, startPos, readableBytes ) ){ 00100 bufLen = 0; 00101 return false; 00102 } 00103 00104 if( contigI == ALL_CONTIGS ){ 00105 uint32 curLen = 0; 00106 uint64 bytesRead = 0; 00107 while (curLen < bufLen){ 00108 //SeqSeek to start, Figure out how much can be read before SeqSeeking again. 00109 if(readableBytes <= 0) //Look out for zero length contigs! IMPLEMENT ME 00110 if( !SeqSeek( start + curLen, contigIndex, startPos, readableBytes ) ){ 00111 bufLen = curLen; 00112 return true; 00113 } 00114 //readLen is the amount to read on this pass 00115 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes; 00116 gnSeqC* tmpBuf = new gnSeqC[readLen]; //read into tmpBuf, then filter tmpBuf into curBuf 00117 00118 // read chars and filter 00119 m_ifstream.read(tmpBuf, readLen); 00120 uint64 gc = m_ifstream.gcount(); 00121 bytesRead += gc; 00122 readableBytes -= gc; 00123 // cout << "Read " << gc << " chars from " << m_openString << "\n"; 00124 // cout << "Checking character validity on: " << tmpBuf << "\n"; 00125 for(uint32 i=0; i < gc; i++){ 00126 if( m_pFilter->IsValid(tmpBuf[i]) ){ 00127 buf[curLen] = tmpBuf[i]; 00128 curLen++; 00129 } 00130 } 00131 delete[] tmpBuf; 00132 if(m_ifstream.eof()){ //we hit the end of the file. bail out. 00133 m_ifstream.clear(); 00134 bufLen = curLen; 00135 return true; 00136 } 00137 } 00138 bufLen = curLen; 00139 } 00140 else if( contigI < m_contigList.size() ) 00141 { 00142 uint32 curLen = 0; 00143 //check to see if the buffer is bigger than the contig. if so truncate it. 00144 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength(); 00145 bufLen = bufLen < contigSize ? bufLen : contigSize; 00146 while (curLen < bufLen) 00147 { 00148 uint64 readLen = bufLen - curLen; //the amount to read on this pass 00149 gnSeqC* tmpBuf = new gnSeqC[readLen]; //read into tmpBuf, then filter tmpBuf into curBuf 00150 00151 // read chars and filter 00152 m_ifstream.read(tmpBuf, readLen); 00153 uint64 gc = m_ifstream.gcount(); 00154 // cout << "Read " << gc << " chars from " << m_openString << "\n"; 00155 // cout << "Checking character validity on: " << tmpBuf << "\n"; 00156 for(uint32 i=0; i < gc; i++){ 00157 if( m_pFilter->IsValid(tmpBuf[i]) ){ 00158 buf[curLen] = tmpBuf[i]; 00159 curLen++; 00160 } 00161 } 00162 if(m_ifstream.eof()){ //we hit the end of the file. bail out. 00163 m_ifstream.clear(); 00164 bufLen = curLen; 00165 return true; 00166 } 00167 } 00168 bufLen = curLen; 00169 } 00170 return true; 00171 } 00172 00173 00174 // private: 00175 // figures out which contig the sequence starts at then calls SeqStartPos to get the offset within that contig 00176 // returns startPos, the file offset where the sequence starts 00177 // returns true if successful, false otherwise 00178 boolean gnFASSource::SeqSeek( const gnSeqI start, const uint32 contigI, uint64& startPos, uint64& readableBytes ) 00179 { 00180 if( contigI == ALL_CONTIGS ){ 00181 // find first contig 00182 gnSeqI curIndex = 0; 00183 vector< gnFileContig* >::iterator iter = m_contigList.begin(); 00184 for( ; iter != m_contigList.end(); ++iter ) 00185 { 00186 uint64 len = (*iter)->GetSeqLength(); 00187 if( (curIndex + len) > start ) 00188 break; 00189 curIndex += len; 00190 } 00191 if( iter == m_contigList.end() ) 00192 return false; 00193 // seek to start 00194 gnSeqI startIndex = start - curIndex; //startIndex is starting pos. within the contig 00195 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes ); 00196 } 00197 else if( contigI < m_contigList.size() ) 00198 { 00199 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes ); 00200 } 00201 return false; 00202 } 00203 00204 //Returns startPos, the file offset where the sequence starts. 00205 //IMPLEMENT ME! utilize HasRepeatSeqGap to skip slow character checking process (data isn't noisy) 00206 boolean gnFASSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes ){ 00207 readableBytes = 0; 00208 uint32 curLen = 0; 00209 //seek to the file offset where the contig starts 00210 startPos = contig.GetSectStartEnd(gnContigSequence).first; //set startPos to start where the contig starts 00211 00212 if(contig.HasRepeatSeqGap()) 00213 if(contig.GetRepeatSeqGapSize().first > 0) 00214 if(contig.GetRepeatSeqGapSize().second > 0){ 00215 //check this 00216 startPos += start + (start/contig.GetRepeatSeqGapSize().first)*contig.GetRepeatSeqGapSize().second; 00217 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00218 m_ifstream.seekg( startPos, ios::beg ); 00219 return true; 00220 } 00221 m_ifstream.seekg( startPos, ios::beg ); 00222 if( m_ifstream.eof() ){ 00223 ErrorMsg("ERROR in gnFASSource::Incorrect contig start position, End of file reached!\n"); 00224 return false; 00225 } 00226 while( true ){ 00227 // READ the rest of the contig skipping over invalid characters until we get to the starting base pair. 00228 // startPos will contain the file offset with the starting base pair 00229 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00230 if(tmpbufsize == 0){ 00231 ErrorMsg("ERROR in gnFASSource: stored contig size is incorrect.\n"); 00232 return false; 00233 } 00234 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE; //read in the smaller of the two. 00235 char *tmpbuf = new char[tmpbufsize]; 00236 m_ifstream.read( tmpbuf, tmpbufsize ); 00237 if( m_ifstream.eof() ){ 00238 ErrorMsg("ERROR in gnFASSource::Read End of file reached!\n"); 00239 delete[] tmpbuf; 00240 return false; 00241 } 00242 for( uint32 i=0; i < tmpbufsize; ++i ){ 00243 if( m_pFilter->IsValid(tmpbuf[i]) ){ 00244 if( curLen >= start ){ //stop when we reach the starting offset within this contig 00245 startPos += i; 00246 m_ifstream.seekg( startPos, ios::beg ); //seek to startPos 00247 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00248 delete[] tmpbuf; 00249 return true; 00250 } 00251 ++curLen; //each time we read a valid b.p., increment the sequence length 00252 } 00253 } 00254 startPos += tmpbufsize; 00255 delete[] tmpbuf; 00256 } 00257 return true; 00258 } 00259 00260 //write out the given source in this file format. 00261 boolean gnFASSource::Write(gnBaseSource *source, const string& filename){ 00262 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary); 00263 if(!m_ofstream.is_open()) 00264 return false; 00265 uint32 contigCount = source->GetContigListLength(); 00266 for(uint32 contigI = 0; contigI < contigCount; contigI++){ 00267 string contigName = source->GetContigName(contigI); 00268 m_ofstream << ">" << contigName << ";\n"; 00269 gnSeqI seqLength = source->GetContigSeqLength(contigI); 00270 while(seqLength > 0){ 00271 gnSeqI writeLen = seqLength < BUFFER_SIZE ? seqLength : BUFFER_SIZE; 00272 gnSeqC *bases = new gnSeqC[writeLen+1]; 00273 boolean success = source->SeqRead(0, bases, writeLen, contigI); 00274 if(!success) 00275 return false; 00276 bases[writeLen] = '\0'; 00277 m_ofstream << bases << "\n"; 00278 delete[] bases; 00279 seqLength -= writeLen; 00280 } 00281 } 00282 m_ofstream.close(); 00283 return true; 00284 } 00285 00286 //write out the given sequence in this file format. 00287 void gnFASSource::Write(gnSequence& seq, const string& filename, boolean write_coords, boolean enforce_unique_names){ 00288 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary); 00289 if(!m_ofstream.is_open()) 00290 Throw_gnEx(FileNotOpened()); 00291 Write(seq, m_ofstream, write_coords, enforce_unique_names); 00292 m_ofstream.close(); 00293 } 00294 00295 void gnFASSource::Write(gnSequence& seq, ostream& m_ostream, boolean write_coords = true, boolean enforce_unique_names = true){ 00296 vector<string> contigNameList; 00297 gnSeqC *bases = new gnSeqC[BUFFER_SIZE]; 00298 gnGenomeSpec* spec = seq.GetSpec(); 00299 00300 //set the newline type. 00301 string newline = "\r\n"; 00302 gnSeqI readOffset = 1; 00303 00304 for(uint32 fragI = 0; fragI < seq.contigListLength(); fragI++){ 00305 //write out contig name and header 00306 string contigName = seq.contigName(fragI); 00307 00308 if(enforce_unique_names){ 00309 uint32 name_count = 0; 00310 for(uint32 i=0; i < contigNameList.size(); i++) 00311 if(contigNameList[i] == contigName) 00312 name_count++; 00313 contigNameList.push_back(contigName); 00314 if(name_count > 0) 00315 contigName += "_" + uintToString(name_count); 00316 } 00317 00318 gnFragmentSpec* subSpec = spec->GetSpec(fragI); 00319 gnBaseHeader *gpbh = subSpec->GetHeader(0); 00320 string header = ""; 00321 if(gpbh != NULL){ 00322 header = gpbh->GetHeader(); 00323 //delete everything after the first newline. 00324 uint32 newlinepos = header.find_first_of('\n', 0); 00325 if(newlinepos != string::npos){ 00326 if(header[newlinepos-1] == '\r') 00327 newlinepos--; 00328 header = header.substr(0, newlinepos); 00329 } 00330 } //IMPLEMENT ME!! Does header line need to be < 80 chars? 00331 00332 //write out the sequence 00333 gnSeqI readLength = seq.contigLength(fragI); 00334 m_ostream << ">" << contigName; 00335 if(write_coords) 00336 m_ostream << " " << "(" << readOffset << ", " << readOffset + readLength - 1 << ")"; 00337 m_ostream << "; " << header << newline; 00338 00339 gnSeqI linePos = 0; 00340 while(readLength > 0){ //buffer the read/writes 00341 gnSeqI read_chunk_size = (BUFFER_SIZE / FAS_LINE_WIDTH) * FAS_LINE_WIDTH; 00342 gnSeqI writeLen = readLength < read_chunk_size ? readLength : read_chunk_size; 00343 00344 if(!seq.ToArray(bases, writeLen, readOffset)) 00345 return; 00346 for(gnSeqI curbase = 0; curbase < writeLen; curbase += FAS_LINE_WIDTH){ 00347 gnSeqI writeout_size = writeLen - curbase < FAS_LINE_WIDTH ? writeLen - curbase : FAS_LINE_WIDTH; 00348 m_ostream << string(bases + curbase, writeout_size) << newline; 00349 } 00350 readLength -= writeLen; 00351 readOffset += writeLen; 00352 } 00353 if(linePos != 0) 00354 m_ostream << newline; 00355 } 00356 00357 delete[] bases; 00358 } 00359 00360 gnGenomeSpec *gnFASSource::GetSpec() const{ 00361 gnGenomeSpec *spec = new gnGenomeSpec(); 00362 for(uint32 i=0; i < m_contigList.size(); i++){ 00363 //create specs for the fragment and contig levels 00364 gnFragmentSpec *fragmentSpec = new gnFragmentSpec(); 00365 gnSourceSpec *contigSpec = new gnSourceSpec((gnBaseSource*)this, i); 00366 //set up the spec tree structure 00367 spec->AddSpec(fragmentSpec, i); 00368 fragmentSpec->AddSpec(contigSpec); 00369 00370 fragmentSpec->SetName(m_contigList[i]->GetName()); 00371 fragmentSpec->SetSourceName(m_openString); 00372 contigSpec->SetName(m_contigList[i]->GetName()); 00373 contigSpec->SetSourceName(m_openString); 00374 00375 pair<uint32, uint32> headsect = m_contigList[i]->GetSectStartEnd(gnContigHeader); 00376 if(headsect.first != headsect.second){ 00377 gnSourceHeader *gpsh = new gnSourceHeader((gnBaseSource *)this, string(""), headsect.first, headsect.second - headsect.first); 00378 fragmentSpec->AddHeader(gpsh, 0); 00379 } 00380 } 00381 return spec; 00382 } 00383 00384 gnFileContig* gnFASSource::GetFileContig( const uint32 contigI ) const{ 00385 if(m_contigList.size() > contigI) 00386 return m_contigList[contigI]; 00387 return NULL; 00388 } 00389 00390 boolean gnFASSource::ParseStream( istream& fin ) 00391 { 00392 // INIT temp varables 00393 uint32 readState = 0; 00394 gnFileContig* currentContig = 0; 00395 string nameFStr; 00396 uint64 seqLength = 0, gapLength = 0; 00397 // INIT buffer 00398 uint64 streamPos = 0; 00399 uint64 bufReadLen = 0; 00400 char* buf = new char[BUFFER_SIZE]; 00401 boolean paren_hit = false; 00402 uint32 curNewlineSize = 0; 00403 uint32 repeatSeqSize = 80; 00404 00405 //decide what type of newlines we have 00406 00407 fin.getline( buf, BUFFER_SIZE); 00408 fin.seekg(-2, ios::cur); 00409 fin.read( buf, 2); 00410 fin.seekg(0); 00411 if(buf[1] == '\n'){ 00412 if(buf[0] == '\r'){ 00413 m_newlineType = gnNewlineWindows; 00414 curNewlineSize = 2; 00415 }else{ 00416 curNewlineSize = 1; 00417 if(buf[1] == '\r') 00418 m_newlineType = gnNewlineMac; 00419 else 00420 m_newlineType = gnNewlineUnix; 00421 } 00422 } 00423 00424 while( !fin.eof() ) 00425 { 00426 // read chars 00427 fin.read( buf, BUFFER_SIZE); 00428 streamPos += bufReadLen; 00429 bufReadLen = fin.gcount(); 00430 for( uint32 i=0 ; i < bufReadLen ; i++ ) 00431 { 00432 char ch = buf[i]; 00433 switch( readState ) 00434 { 00435 case 0: // CHECK file validity 00436 if( !((buf[0] == '>') || !m_pFilter->IsValid(buf[0])) ) 00437 { 00438 delete[] buf; 00439 return false; // NOT a .fas file 00440 } 00441 readState = 1; 00442 case 1: // BRANCH 00443 if( ch == '>' ) 00444 { 00445 seqLength = 0; gapLength = 0; // reset count 00446 currentContig = new gnFileContig(); 00447 currentContig->SetFileStart( streamPos + i ); 00448 currentContig->SetRepeatSeqGap(true); 00449 currentContig->SetRepeatSeqSize( repeatSeqSize ); 00450 currentContig->SetRepeatGapSize( curNewlineSize ); 00451 readState = 2; 00452 paren_hit = false; 00453 nameFStr = ""; 00454 } 00455 else 00456 ++gapLength; 00457 break; 00458 case 2: // > CONTIG NAME 00459 if( isNewLine(ch) || ch == ';') 00460 { 00461 currentContig->SetName( nameFStr ); 00462 currentContig->SetSectStart( gnContigHeader, streamPos + i + 1 ); 00463 if( ch == ';' ) 00464 readState = 3; 00465 else{ 00466 readState = 4; 00467 if(ch == '\r') 00468 currentContig->SetSectStart( gnContigHeader, streamPos + i + 2 ); 00469 } 00470 } 00471 else if( ch == '(' ){ 00472 if(isSpace(buf[i-1])){ 00473 //delete the last char in nameFStr 00474 nameFStr = nameFStr.substr(0, nameFStr.length()-1); 00475 } 00476 paren_hit = true; 00477 }else if((!isSpace(ch) || nameFStr.length()) && !paren_hit ) 00478 nameFStr += ch; 00479 break; 00480 case 3: // >... ; REMARK 00481 if( isNewLine(ch) ) 00482 readState = 4; 00483 break; 00484 case 4: // >... ; /n NEWLINE BRANCH 00485 if( ch == '>' ) 00486 { 00487 readState = 3; 00488 } 00489 else if( m_pFilter->IsValid(ch) ) 00490 { 00491 currentContig->SetSectEnd( gnContigHeader, streamPos + i ); 00492 currentContig->SetSectStart( gnContigSequence, streamPos + i); 00493 seqLength = 1; gapLength = 0; 00494 readState = 5; 00495 } 00496 break; 00497 case 5: // SEQUENCE 00498 while(i < bufReadLen){ 00499 ch = buf[i]; 00500 if( m_pFilter->IsValid(ch) ){ 00501 if(gapLength > 0){ 00502 if(seqLength != repeatSeqSize) 00503 currentContig->SetRepeatSeqGap(false); 00504 if(gapLength != curNewlineSize) 00505 //file is corrupt, can't do jumps. 00506 currentContig->SetRepeatSeqGap(false); 00507 currentContig->AddToSeqLength( seqLength ); 00508 seqLength = 0; 00509 gapLength = 0; 00510 } 00511 seqLength++; 00512 }else if( ch == '>'){ 00513 currentContig->AddToSeqLength( seqLength ); 00514 currentContig->SetSectEnd( gnContigSequence, streamPos + i - 1); 00515 currentContig->SetFileEnd( streamPos + i - 1 ); 00516 m_contigList.push_back(currentContig); 00517 readState = 1; 00518 i--; 00519 break; 00520 } 00521 else if( isNewLine(ch) ){ 00522 gapLength++; 00523 } 00524 else{ 00525 // Error cannot do nice jumps 00526 currentContig->SetRepeatSeqGap(false); 00527 } 00528 i++; 00529 } 00530 break; 00531 default: 00532 ErrorMsg("ERROR"); 00533 delete[] buf; 00534 return false; 00535 break; 00536 } 00537 }// for all buf 00538 }// while !eof 00539 // CLEAN UP 00540 if( currentContig != 0 ) 00541 { 00542 if( readState == 2 ) 00543 { 00544 currentContig->SetName( nameFStr ); 00545 } 00546 if( (readState >= 2) && (readState < 5) ) 00547 { 00548 currentContig->SetSectEnd( gnContigHeader, streamPos + bufReadLen ); 00549 } 00550 else if( readState == 5 ) 00551 { 00552 currentContig->AddToSeqLength( seqLength ); 00553 currentContig->SetSectEnd( gnContigSequence, streamPos + bufReadLen ); 00554 } 00555 currentContig->SetFileEnd( streamPos + bufReadLen ); 00556 m_contigList.push_back(currentContig); 00557 } 00558 m_ifstream.clear(); 00559 delete[] buf; 00560 return true; 00561 } Generated at Fri Nov 30 15:36:51 2001 for libGenome by 1.2.8.1 written by Dimitri van Heesch, © 1997-2001 |