|
gnSEQSource.cppGo to the documentation of this file.00001 00002 // File: gnSEQSource.h 00003 // Purpose: Implements gnBaseSource for .SEQ files 00004 // Description: 00005 // Changes: 00006 // Version: libGenome 0.1.0 00007 // Author: Aaron Darling 00008 // Last Edited: April 15, 2001, 11:13:00pm 00009 // Modified by: 00010 // Copyright: (c) Aaron Darling 00011 // Licenses: Proprietary 00013 #include "gn/gnFilter.h" 00014 #include "gn/gnFeature.h" 00015 #include "gn/gnSEQSource.h" 00016 #include "gn/gnGBKSource.h" 00017 #include "gn/gnSourceSpec.h" 00018 #include "gn/gnSourceHeader.h" 00019 #include "gn/gnSourceQualifier.h" 00020 #include "gn/gnLocation.h" 00021 #include "gn/gnStringTools.h" 00022 #include "gn/gnDebug.h" 00023 00024 gnSEQSource::gnSEQSource() 00025 { 00026 m_openString = ""; 00027 m_pFilter = gnFilter::fullDNASeqFilter(); 00028 if(m_pFilter == NULL){ 00029 DebugMsg("Error using static sequence filters.\n"); 00030 } 00031 } 00032 gnSEQSource::gnSEQSource( const gnSEQSource& s ) : gnFileSource(s) 00033 { 00034 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin(); 00035 for( ; iter != s.m_contigList.end(); ++iter ) 00036 { 00037 m_contigList.push_back( (*iter)->Clone() ); 00038 } 00039 } 00040 gnSEQSource::~gnSEQSource() 00041 { 00042 m_ifstream.close(); 00043 vector< gnFileContig* >::iterator iter = m_contigList.begin(); 00044 for( ; iter != m_contigList.end(); ++iter ) 00045 { 00046 gnFileContig* fg = *iter; 00047 *iter = 0; 00048 delete fg; 00049 } 00050 } 00051 boolean gnSEQSource::HasContig( const string& name ) const 00052 { 00053 for(uint32 i = 0 ; i <= m_contigList.size(); i++ ) 00054 { 00055 if( name == m_contigList[i]->GetName() ) 00056 return true; 00057 } 00058 return false; 00059 } 00060 uint32 gnSEQSource::GetContigID( const string& name ) const 00061 { 00062 for(uint32 i = 0 ; i <= m_contigList.size(); i++ ) 00063 { 00064 if( name == m_contigList[i]->GetName() ) 00065 return i; 00066 } 00067 return ALL_CONTIGS; 00068 } 00069 string gnSEQSource::GetContigName( const uint32 i ) const 00070 { 00071 if( i < m_contigList.size() ) 00072 { 00073 return m_contigList[i]->GetName(); 00074 } 00075 return ""; 00076 } 00077 gnSeqI gnSEQSource::GetContigSeqLength( const uint32 i ) const 00078 { 00079 if( i == ALL_CONTIGS) 00080 return m_spec->GetLength(); 00081 if( i < m_contigList.size() ) 00082 { 00083 return m_contigList[i]->GetSeqLength(); 00084 } 00085 return GNSEQI_ERROR; 00086 } 00087 00088 boolean gnSEQSource::SeqRead( const gnSeqI start, char* buf, uint32& bufLen, const uint32 contigI ){ 00089 uint64 startPos = 0; 00090 uint64 readableBytes = 0; 00091 if( !SeqSeek( start, contigI, startPos, readableBytes ) ) 00092 { 00093 bufLen = 0; 00094 return false; 00095 } 00096 00097 if( contigI == ALL_CONTIGS ) 00098 { 00099 uint32 curLen = 0; 00100 uint64 bytesRead = 0; 00101 while (curLen < bufLen) 00102 { 00103 //SeqSeek to start, Figure out how much can be read before SeqSeeking again. 00104 if(readableBytes <= 0) //Look out for zero length contigs! IMPLEMENT ME 00105 if( !SeqSeek( start + curLen, contigI, startPos, readableBytes ) ){ 00106 bufLen = curLen; 00107 return true; 00108 } 00109 //readLen is the amount to read on this pass 00110 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes; 00111 gnSeqC* tmpBuf = new gnSeqC[readLen]; //read into tmpBuf, then filter tmpBuf into curBuf 00112 00113 // read chars and filter 00114 m_ifstream.read(tmpBuf, readLen); 00115 uint64 gc = m_ifstream.gcount(); 00116 bytesRead += gc; 00117 readableBytes -= gc; 00118 for(uint32 i=0; i < gc; i++){ 00119 if( m_pFilter->IsValid(tmpBuf[i]) ){ 00120 buf[curLen] = tmpBuf[i]; 00121 curLen++; 00122 } 00123 } 00124 delete[] tmpBuf; 00125 if(m_ifstream.eof()){ //we hit the end of the file. bail out. 00126 m_ifstream.clear(); 00127 bufLen = curLen; 00128 return true; 00129 } 00130 } 00131 bufLen = curLen; 00132 } 00133 else if( contigI < m_contigList.size() ) 00134 { 00135 uint32 curLen = 0; 00136 //check to see if the buffer is bigger than the contig. if so truncate it. 00137 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength(); 00138 bufLen = bufLen < contigSize ? bufLen : contigSize; 00139 while (curLen < bufLen) 00140 { 00141 uint64 readLen = bufLen - curLen; //the amount to read on this pass 00142 gnSeqC* tmpBuf = new gnSeqC[readLen]; //read into tmpBuf, then filter tmpBuf into curBuf 00143 00144 // read chars and filter 00145 m_ifstream.read(tmpBuf, readLen); 00146 uint64 gc = m_ifstream.gcount(); 00147 // cout << "Read " << gc << " chars from " << m_openString << "\n"; 00148 // cout << "Checking character validity on: " << tmpBuf << "\n"; 00149 for(uint32 i=0; i < gc; i++){ 00150 if( m_pFilter->IsValid(tmpBuf[i]) ){ 00151 buf[curLen] = tmpBuf[i]; 00152 curLen++; 00153 } 00154 } 00155 if(m_ifstream.eof()){ //we hit the end of the file. bail out. 00156 m_ifstream.clear(); 00157 bufLen = curLen; 00158 return true; 00159 } 00160 delete[] tmpBuf; 00161 } 00162 bufLen = curLen; 00163 } 00164 return true; 00165 00166 } 00167 // private: 00168 // figures out which contig the sequence starts at then calls SeqStartPos to get the offset within that contig 00169 // returns startPos, the file offset where the sequence starts 00170 // returns true if successful, false otherwise 00171 boolean gnSEQSource::SeqSeek( const gnSeqI start, const uint32& contigI, uint64& startPos, uint64& readableBytes ) 00172 { 00173 if( contigI == ALL_CONTIGS ) 00174 { 00175 // find first contig 00176 gnSeqI curIndex = 0; 00177 vector< gnFileContig* >::iterator iter = m_contigList.begin(); 00178 for( ; iter != m_contigList.end(); ++iter ) 00179 { 00180 uint64 len = (*iter)->GetSeqLength(); 00181 if( (curIndex + len) > start ) 00182 break; 00183 curIndex += len; 00184 } 00185 if( iter == m_contigList.end() ) 00186 return false; 00187 // seek to start 00188 gnSeqI startIndex = start - curIndex; //startIndex is starting pos. within the contig 00189 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes ); 00190 } 00191 else if( contigI < m_contigList.size() ) 00192 { 00193 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes ); 00194 } 00195 return false; 00196 } 00197 //Returns startPos, the file offset where the sequence starts. 00198 boolean gnSEQSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes ) 00199 { 00200 readableBytes = 0; 00201 uint32 curLen = 0; 00202 //seek to the file offset where the contig starts 00203 startPos = contig.GetSectStartEnd(gnContigSequence).first; //set startPos to start where the contig starts 00204 m_ifstream.seekg( startPos, ios::beg ); 00205 if( m_ifstream.eof() ){ 00206 DebugMsg("ERROR in gnSEQSource::Incorrect contig start position, End of file reached!\n"); 00207 return false; 00208 } 00209 while( true ) 00210 { 00211 // READ the rest of the contig skipping over invalid characters until we get to the starting base pair. 00212 // startPos will contain the file offset with the starting base pair 00213 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00214 if(tmpbufsize == 0){ 00215 DebugMsg("ERROR in gnSEQSource: stored contig size is incorrect."); 00216 return false; 00217 } 00218 uint64 startOffset = start; 00219 if(contig.HasRepeatSeqGap()){ 00220 if(contig.GetRepeatSeqGapSize().first > 0){ 00221 if(contig.GetRepeatSeqGapSize().second > 0){ 00222 startOffset += (start*contig.GetRepeatSeqGapSize().second)/contig.GetRepeatSeqGapSize().first; 00223 startPos+=startOffset; 00224 m_ifstream.seekg(startPos , ios::beg); 00225 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00226 return true; 00227 } 00228 }else{ 00229 startPos+=start; 00230 m_ifstream.seekg(startPos , ios::beg); 00231 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00232 return true; 00233 } 00234 } 00235 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE; //read in the smaller of the two. 00236 char *tmpbuf = new char[tmpbufsize]; 00237 m_ifstream.read( tmpbuf, tmpbufsize ); 00238 if( m_ifstream.eof() ){ 00239 ErrorMsg("ERROR in gnSEQSource::Read End of file reached!\n"); 00240 delete[] tmpbuf; 00241 return false; 00242 } 00243 for( uint32 i=0; i < tmpbufsize; ++i ){ 00244 if( m_pFilter->IsValid(tmpbuf[i]) ){ 00245 if( curLen >= start ){ //stop when we reach the starting offset within this contig 00246 startPos += i; 00247 m_ifstream.seekg( startPos, ios::beg ); //seek to startPos 00248 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00249 delete[] tmpbuf; 00250 return true; 00251 } 00252 ++curLen; //each time we read a valid b.p., increment the sequence length 00253 } 00254 } 00255 startPos += tmpbufsize; 00256 delete[] tmpbuf; 00257 } 00258 return true; 00259 } 00260 00261 00262 //IMPLEMENT ME! move these static methods somewhere else! especially basecount! 00263 void gnSEQSource::BaseCount(const string& bases, gnSeqI& a_count, gnSeqI& c_count, gnSeqI& g_count, gnSeqI& t_count, gnSeqI& other_count){ 00264 a_count = 0; 00265 c_count = 0; 00266 g_count = 0; 00267 t_count = 0; 00268 other_count = 0; 00269 for(uint32 i=0; i < bases.length(); i++){ 00270 if((bases[i] == 'a')||(bases[i] == 'A')) 00271 a_count++; 00272 else if((bases[i] == 'c')||(bases[i] == 'C')) 00273 c_count++; 00274 else if((bases[i] == 'g')||(bases[i] == 'G')) 00275 g_count++; 00276 else if((bases[i] == 't')||(bases[i] == 'T')) 00277 t_count++; 00278 else 00279 other_count++; 00280 } 00281 } 00282 00283 void gnSEQSource::FormatString(string& data, uint32 offset, uint32 width){ 00284 //first remove newlines and corresponding whitespace 00285 string::size_type newline_loc = data.find_first_of('\n', 0); 00286 while(newline_loc != string::npos){ 00287 if(data[newline_loc-1] == '\r') 00288 newline_loc--; 00289 string::size_type text_loc = newline_loc; 00290 while((data[text_loc] == ' ') ||(data[text_loc] == ' ')||(data[text_loc] == '\n')||(data[text_loc] == '\r')){ 00291 text_loc++; 00292 if(text_loc+1 == data.length()) 00293 break; 00294 } 00295 data = (data.substr(0, newline_loc) + " " + data.substr(text_loc)); 00296 newline_loc = data.find_first_of('\n', 0); 00297 } 00298 //now reformat with newlines and whitespace, observing word boundaries... 00299 string output_string = ""; 00300 for(uint32 charI = 0; charI < data.length();){ 00301 //get the substring to append and increment charI 00302 string::size_type base_loc = charI; 00303 string append_string; 00304 while(base_loc - charI <= width){ 00305 string::size_type space_loc = data.find_first_of(' ', base_loc+1); 00306 if(space_loc - charI < width) 00307 base_loc = space_loc; 00308 else if(base_loc == charI){ 00309 //word is too big for one line. split it. 00310 append_string = data.substr(charI, width); 00311 charI+=width; 00312 }else{ 00313 append_string = data.substr(charI, base_loc - charI); 00314 charI = base_loc; 00315 } 00316 } 00317 output_string += string(offset, ' ') + append_string; 00318 if(charI + width < data.length()) 00319 output_string += "\r\n"; 00320 } 00321 data = output_string; 00322 } 00323 00324 boolean gnSEQSource::Write(gnGenomeSpec *spec, const string& filename){ 00325 ErrorMsg("Writing DNAStar SEQ files is not supported at this time. Try again next week.\n"); 00326 return false; 00327 } 00328 00329 gnFileContig* gnSEQSource::GetFileContig( const uint32 contigI ) const{ 00330 if(m_contigList.size() > contigI) 00331 return m_contigList[contigI]; 00332 return NULL; 00333 } 00334 00335 //File parsing access routine 00336 boolean gnSEQSource::ParseStream( istream& fin ) 00337 { 00338 // INIT temp varables 00339 uint32 readState = 0; 00340 uint32 lineStart = 0; 00341 int64 gapstart = -1; 00342 // INIT buffer 00343 uint32 sectionStart = 0; 00344 uint64 streamPos = 0; 00345 uint64 bufReadLen = 0; 00346 uint64 remainingBuffer = 0; 00347 char* buf = new char[BUFFER_SIZE]; 00348 gnFragmentSpec* curFrag = 0; 00349 gnSourceSpec* curSpec = 0; 00350 gnSourceHeader *curHeader; 00351 gnBaseFeature* curFeature; 00352 gnFileContig* curContig = 0; 00353 gnLocation::gnLocationType curBaseLocationType; 00354 gnSeqI curLocationStart; 00355 int32 curStartLength = 0; 00356 int32 curEndLength = 0; 00357 string curLocContig = ""; 00358 string curQualifierName; 00359 uint64 curQualifierStart; 00360 string curContigName = ""; 00361 gnSeqI seqLength = 0; 00362 gnSeqI lineSeqSize = 0; 00363 00364 m_spec = new gnGenomeSpec(); 00365 while( !fin.eof() ) 00366 { 00367 if(sectionStart > 0){ 00368 if(readState == 15) 00369 sectionStart = bufReadLen; 00370 else if(readState == 16) 00371 sectionStart = lineStart; 00372 remainingBuffer = bufReadLen - sectionStart; 00373 memmove(buf, buf+sectionStart, remainingBuffer); 00374 } 00375 // read chars 00376 fin.read( buf + remainingBuffer, BUFFER_SIZE - remainingBuffer); 00377 streamPos -= remainingBuffer; 00378 lineStart -= sectionStart; 00379 if(gapstart > 0) 00380 gapstart -= sectionStart; 00381 sectionStart = 0; 00382 bufReadLen = fin.gcount(); 00383 bufReadLen += remainingBuffer; 00384 00385 for( uint32 i=remainingBuffer ; i < bufReadLen ; i++ ) 00386 { 00387 char ch = buf[i]; 00388 switch( readState ) 00389 { 00390 case 0: //Assume we are in header at the start of a new line. 00391 //Look for keywords starting in column 1 00392 if((ch == '\n')&&(buf[lineStart] != ' ')&&(buf[lineStart] != ' ')){ //not equal to space or tab 00393 if(curSpec == NULL){ 00394 curSpec = new gnSourceSpec(this, m_spec->GetSpecListLength()); 00395 curFrag = new gnFragmentSpec(); 00396 curFrag->AddSpec(curSpec); 00397 curSpec->SetSourceName(m_openString); 00398 m_spec->AddSpec(curFrag); 00399 } 00400 if(lineStart != sectionStart){ //Add the previous header to our list 00401 uint32 j = SEQ_HEADER_NAME_LENGTH-1; 00402 for(; j > 0; j--) 00403 if((buf[sectionStart+j] != ' ')&&(buf[sectionStart+j] != ' ')) 00404 break; 00405 string header_name = string(buf+sectionStart, j+1); 00406 curHeader = new gnSourceHeader(this, header_name, sectionStart + streamPos, lineStart - sectionStart); 00407 //if this is header info _before_ a locus statement then its a general file header. 00408 if(strncmp(&buf[lineStart], "LOCUS", 5) == 0) 00409 m_spec->AddHeader(curHeader); 00410 else //otherwise its a fragment header. 00411 curFrag->AddHeader(curHeader); 00412 sectionStart = lineStart; 00413 } 00414 00415 if(strncmp(&buf[lineStart], "FEATURES", 8) == 0){ 00416 sectionStart = i + 1; 00417 readState = 1; //read in features 00418 }else if(strncmp(&buf[lineStart], "ORIGIN", 6) == 0){ 00419 curHeader = new gnSourceHeader(this, string("ORIGIN"), sectionStart + streamPos, i - sectionStart + 1); 00420 curFrag->AddHeader(curHeader); 00421 curContig = new gnFileContig(); 00422 curContig->SetName(curContigName); 00423 curContigName = ""; 00424 readState = 13; //read in base pairs 00425 }else if(strncmp(&buf[lineStart], "LOCUS", 5) == 0){ 00426 if(strncmp(&buf[lineStart+SEQ_LOCUS_CIRCULAR_COLUMN-1], "circular", 8) == 0) 00427 curFrag->SetCircular(true); 00428 uint32 j = SEQ_LOCUS_NAME_LENGTH; 00429 for(; j >= 0; j--) 00430 if((buf[lineStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')&&(buf[sectionStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')) 00431 break; 00432 curContigName = string(buf+lineStart+SEQ_LOCUS_NAME_COLUMN-1, j+1); 00433 curFrag->SetName(curContigName); 00434 }else if(strncmp(&buf[lineStart], "^^", 2) == 0){ 00435 //start the sequence. 00436 if(curContig == NULL){ 00437 curContig = new gnFileContig(); 00438 curContig->SetName(curContigName); 00439 curContigName = ""; 00440 } 00441 i--; 00442 readState = 14; 00443 break; 00444 } 00445 } 00446 if(ch == '\n') 00447 lineStart = i + 1; 00448 break; 00449 case 1: //look for feature tag in column six. ignore whitespace before feature. 00450 if((ch == ' ')||(ch == ' ')){ 00451 break; 00452 }else if(ch == '\n'){ 00453 lineStart = i + 1; 00454 sectionStart = i + 1; 00455 break; 00456 }else if(sectionStart == i){ //there was no whitespace, we hit a TAG instead 00457 i--; 00458 readState = 0; //Deal with a Header TAG 00459 sectionStart = i + 1; 00460 break; 00461 }else if((i - lineStart == SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1))){ 00462 sectionStart = i; 00463 readState = 2; 00464 } // 00465 case 2: //Get the feature name. stop on whitespace 00466 if((ch == ' ')||(ch == ' ')){ 00467 string featureName(buf+sectionStart, i - sectionStart); 00468 curFeature = new gnFeature(featureName); 00469 curFrag->AddFeature(curFeature); 00470 sectionStart = i + 1; 00471 readState = 3; 00472 } 00473 break; 00474 case 3: //Ignore whitespace before feature location 00475 if((ch == ' ')||(ch == ' ')){ 00476 break; 00477 }else if((ch == '\r')||(ch == '\n')){ 00478 lineStart = i+1; 00479 break; 00480 } 00481 sectionStart = i; 00482 readState = 4; 00483 case 4: //Read a location start. stop on (<.:^ and whitespace 00484 if((ch == ' ')||(ch == ' ')||(ch == '(')||(ch == '.')||(ch=='^')||(ch==':')){ 00485 string starter(buf+sectionStart, i - sectionStart); 00486 if(ch == '('){ 00487 if(starter == "complement") 00488 curFeature->SetLocationType(gnLocation::LT_Complement); 00489 else if(starter == "order") 00490 curFeature->SetLocationType(gnLocation::LT_Order); 00491 else if(starter == "group") 00492 curFeature->SetLocationType(gnLocation::LT_Group); 00493 else if(starter == "one-of") 00494 curFeature->SetLocationType(gnLocation::LT_OneOf); 00495 sectionStart = i + 1; //ignore join since it is default. 00496 break; 00497 }else if(ch == ':'){ 00498 curLocContig = starter; 00499 sectionStart = i + 1; 00500 break; 00501 } 00502 curLocationStart = atoi(starter.c_str()); 00503 readState = 6; //read in end base by default. 00504 if(ch == '.'){ 00505 //go to special state to look for another one. 00506 readState = 5; 00507 sectionStart = i + 1; 00508 break; 00509 }else if(ch == '^'){ 00510 curBaseLocationType = gnLocation::LT_BetweenBases; 00511 }else if((ch == ' ')||(ch == ' ')){ 00512 //no end location go to qualifier 00513 gnLocation curLocation(curLocationStart, curLocationStart); 00514 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength()); 00515 readState = 7; 00516 } 00517 sectionStart = i + 1; 00518 00519 }else if(ch == '<'){ 00520 curStartLength = -1; 00521 sectionStart = i + 1; 00522 }else if(ch == '>'){ 00523 curStartLength = 1; 00524 sectionStart = i + 1; 00525 } 00526 break; 00527 case 5: //look for another period or location start. 00528 if(ch == '.'){ 00529 curBaseLocationType = gnLocation::LT_Standard; 00530 readState = 6; 00531 sectionStart = i + 1; 00532 break; 00533 } 00534 curBaseLocationType = gnLocation::LT_OneOf; 00535 case 6: //see if there's a second location value. stop on >, and whitespace 00536 if(ch == '>'){ 00537 curEndLength = 1; 00538 sectionStart = i + 1; 00539 }else if(ch == '<'){ 00540 curEndLength = -1; 00541 sectionStart = i + 1; 00542 }else if((ch == ' ')||(ch == ' ')||(ch == ',')){ 00543 //read end location 00544 string ender(buf+sectionStart, i - sectionStart); 00545 gnSeqI curLocationEnd = atoi(ender.c_str()); 00546 gnLocation curLocation(curLocationStart, curStartLength, curLocationEnd, curEndLength, curBaseLocationType); 00547 curEndLength = 0; 00548 curStartLength = 0; 00549 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength()); 00550 readState = ch == ',' ? 3 : 7; //read another loc if we need to. 00551 sectionStart = i+1; 00552 } 00553 break; 00554 case 7: //skip to start of qualifier 00555 if((ch != ' ')&&(ch != ' ')&&(lineStart == i)){ 00556 sectionStart = i; // Hit a tag. go deal with it. 00557 readState = 0; 00558 i--; 00559 }else if((ch != ' ')&&(ch != ' ')&&((lineStart == i - SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1)))){ 00560 sectionStart = i; // Hit a feature. go deal with it. 00561 readState = 2; 00562 i--; 00563 }else if(ch == ','){ //oops! another location to read! 00564 sectionStart = i+1; 00565 readState = 3; 00566 }else if(ch == '/'){ //finally, a qualifier. 00567 sectionStart = i+1; 00568 readState = 8; 00569 }else if(ch == '\n') 00570 lineStart = i + 1; 00571 break; 00572 case 8: //get a qualifier, stop on = 00573 if(ch == '='){ 00574 curQualifierName = string(buf+sectionStart, i - sectionStart); 00575 readState = 9; 00576 sectionStart = i+1; 00577 } 00578 break; 00579 case 9: //are we getting a string? look for " or [ 00580 if(ch == '"'){ 00581 readState = 10; 00582 sectionStart = i; 00583 curQualifierStart = i + streamPos; 00584 }else if(ch == '['){ 00585 readState = 11; 00586 sectionStart = i; 00587 }else if((ch == '\r')||(ch == '\n')){ 00588 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart)); 00589 sectionStart = i+1; 00590 readState = 7; //look for another qualifier 00591 } 00592 break; 00593 case 10: //read until the end of the quotation. look out for escaped quotes 00594 if(ch == '"') 00595 readState = 11; 00596 if(ch == '\n'){ 00597 lineStart = i + 1; 00598 } 00599 break; 00600 case 11: 00601 if(ch != '"'){ 00602 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, curQualifierStart, i - sectionStart)); 00603 sectionStart = i+1; 00604 readState = 7; //look for another qualifier. 00605 if(ch == '\n') 00606 lineStart = i + 1; 00607 }else 00608 readState = 10; //quote was escaped. look for another. 00609 break; 00610 case 12: 00611 if(ch == ']'){ 00612 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart)); 00613 sectionStart = i+1; 00614 readState = 7; //look for another qualifier. 00615 } 00616 break; 00617 case 13: //start the sequence read. 00618 if(ch == '^') //stupid blattlab file format. 00619 readState = 14; 00620 else{ 00621 curContig->SetSectStart(gnContigSequence, i + 1 + streamPos); 00622 readState = 16; 00623 } 00624 break; 00625 case 14: //wait for newline before sequence starts. 00626 if(ch == '\n'){ 00627 curContig->SetRepeatSeqGap(true); 00628 lineStart = i + 1; 00629 sectionStart = i + 1; 00630 curContig->SetSectStart(gnContigSequence, i + 1 + streamPos); 00631 readState = 15; 00632 } 00633 break; 00634 case 15: 00635 if(m_pFilter->IsValid(ch)) 00636 seqLength++; 00637 else 00638 curContig->SetRepeatSeqGap(false); 00639 break; 00640 case 16: 00641 if((ch == '/')&&(i==lineStart)){ 00642 readState = 17; 00643 }else if(m_pFilter->IsValid(ch)){ 00644 seqLength++; 00645 lineSeqSize++; 00646 if(gapstart >= 0){ 00647 curContig->SetRepeatGapSize(i - gapstart); 00648 gapstart = -1; 00649 } 00650 }else if(ch == '\n'){ //IMPLEMENT ME! Needs consistent gap size checking 00651 if(sectionStart == lineStart){ 00652 curContig->SetRepeatSeqGap(true); 00653 curContig->SetRepeatSeqSize(seqLength); 00654 gapstart = i; 00655 for(; gapstart >= lineStart; gapstart--) 00656 if(m_pFilter->IsValid(buf[gapstart])) 00657 break; 00658 gapstart++; 00659 }else if(lineSeqSize != curContig->GetRepeatSeqGapSize().first) 00660 curContig->SetRepeatSeqGap(false); 00661 lineSeqSize = 0; 00662 lineStart = i + 1; 00663 } 00664 break; 00665 case 17: 00666 if((ch == '\n')&&(buf[lineStart+1] == '/')){ 00667 curContig->SetSectEnd(gnContigSequence, lineStart - 2 + streamPos); 00668 curContig->SetSeqLength(seqLength); 00669 m_contigList.push_back(curContig); 00670 curContig = 0; 00671 curSpec->SetLength(seqLength); 00672 curSpec = 0; 00673 seqLength = 0; 00674 lineStart = i + 1; 00675 sectionStart = i + 1; 00676 readState = 0; 00677 } 00678 break; 00679 } 00680 } 00681 streamPos += bufReadLen; 00682 } 00683 if(curContig != 0){ 00684 curContig->SetSectEnd(gnContigSequence, streamPos - 1); 00685 curContig->SetSeqLength(seqLength); 00686 m_contigList.push_back(curContig); 00687 curSpec->SetLength(seqLength); 00688 } 00689 if(curSpec != NULL) 00690 if((curFrag->GetFeatureListLength() == 0) && (curFrag->GetHeaderListLength() == 0) 00691 &&(curSpec->GetLength() == 0)){ 00692 m_spec->RemoveSpec(m_spec->GetSpecListLength() - 1); 00693 delete curFrag; 00694 } 00695 m_ifstream.clear(); 00696 delete[] buf; 00697 return true; 00698 } Generated at Fri Nov 30 15:36:51 2001 for libGenome by 1.2.8.1 written by Dimitri van Heesch, © 1997-2001 |