gnSEQSource.cpp Source File

00001 
00002 // File:            gnSEQSource.cpp
00003 // Purpose:         Implements gnBaseSource for .SEQ files
00004 // Description:     
00005 // Changes:        
00006 // Version:         libGenome 0.1.0 
00007 // Author:          Aaron Darling 
00008 // Last Edited:     April 15, 2001, 11:13:00pm 
00009 // Modified by:     
00010 // Copyright:       (c) Aaron Darling 
00011 // Licenses:        Proprietary 
00013 #include "gn/gnFilter.h"
00014 #include "gn/gnFeature.h"
00015 #include "gn/gnSEQSource.h"
00016 #include "gn/gnGBKSource.h"
00017 #include "gn/gnSourceSpec.h"
00018 #include "gn/gnSourceHeader.h"
00019 #include "gn/gnSourceQualifier.h"
00020 #include "gn/gnLocation.h"
00021 #include "gn/gnStringTools.h"
00022 #include "gn/gnDebug.h"
00023 
00024 gnSEQSource::gnSEQSource()
00025 {
00026         m_openString = "";
00027         m_pFilter = gnFilter::fullDNASeqFilter();
00028         if(m_pFilter == NULL){
00029                 DebugMsg("Error using static sequence filters.\n");
00030         }
00031 }
00032 gnSEQSource::gnSEQSource( const gnSEQSource& s ) : gnFileSource(s)
00033 {
00034         vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00035         for( ; iter != s.m_contigList.end(); ++iter )
00036         {
00037                 m_contigList.push_back( (*iter)->Clone() );
00038         }
00039 }
00040 gnSEQSource::~gnSEQSource()
00041 {
00042         m_ifstream.close();
00043         vector< gnFileContig* >::iterator iter = m_contigList.begin();
00044         for( ; iter != m_contigList.end(); ++iter )
00045         {
00046                 gnFileContig* fg = *iter;
00047                 *iter = 0;
00048                 delete fg;
00049         }
00050 }
00051 boolean gnSEQSource::HasContig( const string& name ) const
00052 {
00053         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00054         {
00055                 if( name == m_contigList[i]->GetName() )
00056                         return true;
00057         }
00058         return false;
00059 }
00060 uint32 gnSEQSource::GetContigID( const string& name ) const
00061 {
00062         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00063         {
00064                 if( name == m_contigList[i]->GetName() )
00065                         return i;
00066         }
00067         return ALL_CONTIGS;
00068 }
00069 string gnSEQSource::GetContigName( const uint32 i ) const
00070 {
00071         if( i < m_contigList.size() )
00072         {
00073                 return m_contigList[i]->GetName();
00074         }
00075         return "";
00076 }
00077 gnSeqI gnSEQSource::GetContigSeqLength( const uint32 i ) const
00078 {
00079         if( i == ALL_CONTIGS)
00080                 return m_spec->GetLength();
00081         if( i < m_contigList.size() )
00082         {
00083                 return m_contigList[i]->GetSeqLength();
00084         }
00085         return GNSEQI_ERROR;
00086 }
00087 
00088 boolean gnSEQSource::SeqRead( const gnSeqI start, char* buf, uint32& bufLen, const uint32 contigI ){
00089         uint64 startPos = 0;
00090         uint64 readableBytes = 0;
00091         if( !SeqSeek( start, contigI, startPos, readableBytes ) )
00092         {
00093                 bufLen = 0;
00094                 return false;
00095         }
00096         
00097         if( contigI == ALL_CONTIGS )
00098         {
00099                 uint32 curLen = 0;
00100                 uint64 bytesRead = 0;
00101                 while (curLen < bufLen)
00102                 {
00103 //SeqSeek to start, Figure out how much can be read before SeqSeeking again.
00104                         if(readableBytes <= 0)  //Look out for zero length contigs!  IMPLEMENT ME
00105                                 if( !SeqSeek( start + curLen, contigI, startPos, readableBytes ) ){
00106                                         bufLen = curLen;
00107                                         return true;
00108                                 }
00109                         //readLen is the amount to read on this pass
00110                         uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes; 
00111                         gnSeqC* tmpBuf = new gnSeqC[readLen];   //read into tmpBuf, then filter tmpBuf into curBuf
00112 
00113                         // read chars and filter
00114                         m_ifstream.read(tmpBuf, readLen);
00115                         uint64 gc = m_ifstream.gcount();
00116                         bytesRead += gc;
00117                         readableBytes -= gc;
00118                         for(uint32 i=0; i < gc; i++){
00119                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00120                                         buf[curLen] = tmpBuf[i];
00121                                         curLen++;
00122                                 }
00123                         }
00124                         delete[] tmpBuf;
00125                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00126                                 m_ifstream.clear();
00127                                 bufLen = curLen;
00128                                 return true;
00129                         }
00130                 }
00131                 bufLen = curLen;
00132         }
00133         else if( contigI < m_contigList.size() )
00134         {
00135                 uint32 curLen = 0;
00136                 //check to see if the buffer is bigger than the contig.  if so truncate it.
00137                 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00138                 bufLen = bufLen < contigSize ? bufLen : contigSize;
00139                 while (curLen < bufLen)
00140                 {
00141                         uint64 readLen = bufLen - curLen;       //the amount to read on this pass
00142                         gnSeqC* tmpBuf = new gnSeqC[readLen];   //read into tmpBuf, then filter tmpBuf into curBuf
00143 
00144                         // read chars and filter
00145                         m_ifstream.read(tmpBuf, readLen);
00146                         uint64 gc = m_ifstream.gcount();
00147 //                      cout << "Read " << gc << " chars from " << m_openString << "\n";
00148 //                      cout << "Checking character validity on: " << tmpBuf << "\n";
00149                         for(uint32 i=0; i < gc; i++){
00150                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00151                                         buf[curLen] = tmpBuf[i];
00152                                         curLen++;
00153                                 }
00154                         }
00155                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00156                                 m_ifstream.clear();
00157                                 bufLen = curLen;
00158                                 return true;
00159                         }
00160                         delete[] tmpBuf;
00161                 }
00162                 bufLen = curLen;
00163         }
00164         return true;
00165 
00166 }
00167 // private:
00168 // figures out which contig the sequence starts at then calls SeqStartPos to get the offset within that contig
00169 // returns startPos, the file offset where the sequence starts
00170 // returns true if successful, false otherwise
00171 boolean gnSEQSource::SeqSeek( const gnSeqI start, const uint32& contigI, uint64& startPos, uint64& readableBytes )
00172 {
00173         if( contigI == ALL_CONTIGS )
00174         {
00175                 // find first contig
00176                 gnSeqI curIndex = 0;
00177                 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00178                 for( ; iter != m_contigList.end(); ++iter )
00179                 {
00180                         uint64 len = (*iter)->GetSeqLength();
00181                         if( (curIndex + len) > start )
00182                                 break;
00183                         curIndex += len;
00184                 }
00185                 if( iter == m_contigList.end() )
00186                         return false;
00187                 // seek to start
00188                 gnSeqI startIndex = start - curIndex;  //startIndex is starting pos. within the contig
00189                 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00190         }
00191         else if( contigI < m_contigList.size() )
00192         {
00193                 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00194         }
00195         return false;
00196 }
00197 //Returns startPos, the file offset where the sequence starts.
00198 boolean gnSEQSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes )
00199 {
00200         readableBytes = 0;
00201         uint32 curLen = 0;
00202         //seek to the file offset where the contig starts
00203         startPos = contig.GetSectStartEnd(gnContigSequence).first;      //set startPos to start where the contig starts
00204         m_ifstream.seekg( startPos, ios::beg );
00205         if( m_ifstream.eof() ){
00206                 DebugMsg("ERROR in gnSEQSource::Incorrect contig start position, End of file reached!\n");
00207                 return false;
00208         }
00209         while( true )
00210         {
00211                   // READ the rest of the contig skipping over invalid characters until we get to the starting base pair.
00212                   // startPos will contain the file offset with the starting base pair
00213                 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00214                 if(tmpbufsize == 0){
00215                         DebugMsg("ERROR in gnSEQSource: stored contig size is incorrect.");
00216                         return false;
00217                 }
00218                 uint64 startOffset = start;
00219                 if(contig.HasRepeatSeqGap()){
00220                         if(contig.GetRepeatSeqGapSize().first > 0){
00221                                 if(contig.GetRepeatSeqGapSize().second > 0){
00222                                         startOffset += (start*contig.GetRepeatSeqGapSize().second)/contig.GetRepeatSeqGapSize().first;
00223                                         startPos+=startOffset;
00224                                         m_ifstream.seekg(startPos , ios::beg);
00225                                         readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00226                                         return true;
00227                                 }
00228                         }else{
00229                                 startPos+=start;
00230                                 m_ifstream.seekg(startPos , ios::beg);
00231                                 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00232                                 return true;
00233                         }
00234                 }
00235                 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;  //read in the smaller of the two.
00236                 char *tmpbuf = new char[tmpbufsize];
00237                 m_ifstream.read( tmpbuf, tmpbufsize );
00238                 if( m_ifstream.eof() ){
00239                         ErrorMsg("ERROR in gnSEQSource::Read End of file reached!\n");
00240                         delete[] tmpbuf;
00241                         return false;
00242                 }
00243                 for( uint32 i=0; i < tmpbufsize; ++i ){
00244                         if( m_pFilter->IsValid(tmpbuf[i]) ){
00245                                 if( curLen >= start ){ //stop when we reach the starting offset within this contig
00246                                         startPos += i;
00247                                         m_ifstream.seekg( startPos, ios::beg );  //seek to startPos
00248                                         readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00249                                         delete[] tmpbuf;
00250                                         return true;
00251                                 }
00252                                 ++curLen;  //each time we read a valid b.p., increment the sequence length
00253                         }
00254                 }
00255                 startPos += tmpbufsize;
00256                 delete[] tmpbuf;
00257         }
00258         return true;
00259 }
00260 
00261 
00262 //IMPLEMENT ME!  move these static methods somewhere else!  especially basecount!
00263 void gnSEQSource::BaseCount(const string& bases, gnSeqI& a_count, gnSeqI& c_count, gnSeqI& g_count, gnSeqI& t_count, gnSeqI& other_count){
00264         a_count = 0;
00265         c_count = 0;
00266         g_count = 0;
00267         t_count = 0;
00268         other_count = 0;
00269         for(uint32 i=0; i < bases.length(); i++){
00270                 if((bases[i] == 'a')||(bases[i] == 'A'))
00271                         a_count++;
00272                 else if((bases[i] == 'c')||(bases[i] == 'C'))
00273                         c_count++;
00274                 else if((bases[i] == 'g')||(bases[i] == 'G'))
00275                         g_count++;
00276                 else if((bases[i] == 't')||(bases[i] == 'T'))
00277                         t_count++;
00278                 else
00279                         other_count++;
00280         }
00281 }
00282 
00283 void gnSEQSource::FormatString(string& data, uint32 offset, uint32 width){
00284         //first remove newlines and corresponding whitespace
00285         string::size_type newline_loc = data.find_first_of('\n', 0);
00286         while(newline_loc != string::npos){
00287                 if(data[newline_loc-1] == '\r')
00288                         newline_loc--;
00289                 string::size_type text_loc = newline_loc;
00290                 while((data[text_loc] == ' ') ||(data[text_loc] == '    ')||(data[text_loc] == '\n')||(data[text_loc] == '\r')){
00291                         text_loc++;
00292                         if(text_loc+1 == data.length())
00293                                 break;
00294                 }
00295                 data = (data.substr(0, newline_loc) + " " + data.substr(text_loc));
00296                 newline_loc = data.find_first_of('\n', 0);
00297         }
00298         //now reformat with newlines and whitespace, observing word boundaries...
00299         string output_string = "";
00300         for(uint32 charI = 0; charI < data.length();){
00301                 //get the substring to append and increment charI
00302                 string::size_type base_loc = charI;
00303                 string append_string;
00304                 while(base_loc - charI <= width){
00305                         string::size_type space_loc = data.find_first_of(' ', base_loc+1);
00306                         if(space_loc - charI < width)
00307                                 base_loc = space_loc;
00308                         else if(base_loc == charI){
00309                                 //word is too big for one line.  split it.
00310                                 append_string = data.substr(charI, width);
00311                                 charI+=width;
00312                         }else{
00313                                 append_string = data.substr(charI, base_loc - charI);
00314                                 charI = base_loc;
00315                         }
00316                 }
00317                 output_string += string(offset, ' ') + append_string;
00318                 if(charI + width < data.length())
00319                         output_string += "\r\n";
00320         }
00321         data = output_string;
00322 }
00323 
00324 boolean gnSEQSource::Write(gnGenomeSpec *spec, const string& filename){
00325         ErrorMsg("Writing DNAStar SEQ files is not supported at this time.  Try again next week.\n");
00326         return false;
00327 }
00328 
00329 gnFileContig* gnSEQSource::GetFileContig( const uint32 contigI ) const{
00330         if(m_contigList.size() > contigI)
00331                 return m_contigList[contigI];
00332         return NULL;
00333 }
00334 
00335 //File parsing access routine
00336 boolean gnSEQSource::ParseStream( istream& fin )
00337 {
00338         // INIT temp varables
00339         uint32 readState = 0;
00340         uint32 lineStart = 0;
00341         int64 gapstart = -1;
00342         // INIT buffer
00343         uint32 sectionStart = 0;
00344         uint64 streamPos = 0;
00345         uint64 bufReadLen = 0;
00346         uint64 remainingBuffer = 0;
00347         char* buf = new char[BUFFER_SIZE];
00348         gnFragmentSpec* curFrag = 0;
00349         gnSourceSpec* curSpec = 0;
00350         gnSourceHeader *curHeader;
00351         gnBaseFeature* curFeature;
00352         gnFileContig* curContig = 0;
00353         gnLocation::gnLocationType curBaseLocationType;
00354         gnSeqI curLocationStart;
00355         int32 curStartLength = 0;
00356         int32 curEndLength = 0;
00357         string curLocContig = "";
00358         string curQualifierName;
00359         uint64 curQualifierStart;
00360         string curContigName = "";
00361         gnSeqI seqLength = 0;
00362         gnSeqI lineSeqSize = 0;
00363         
00364         m_spec = new gnGenomeSpec();
00365         while( !fin.eof() )
00366         {
00367                 if(sectionStart > 0){
00368                         if(readState == 15)
00369                                 sectionStart = bufReadLen;
00370                         else if(readState == 16)
00371                                 sectionStart = lineStart;
00372                         remainingBuffer = bufReadLen - sectionStart;
00373                         memmove(buf, buf+sectionStart, remainingBuffer);
00374                 }
00375                   // read chars
00376                 fin.read( buf + remainingBuffer, BUFFER_SIZE - remainingBuffer);
00377                 streamPos -= remainingBuffer;
00378                 lineStart -= sectionStart;
00379                 if(gapstart > 0)
00380                         gapstart -= sectionStart;
00381                 sectionStart = 0;
00382                 bufReadLen = fin.gcount();
00383                 bufReadLen += remainingBuffer;
00384                 
00385                 for( uint32 i=remainingBuffer ; i < bufReadLen ; i++ )
00386                 {
00387                         char ch = buf[i];
00388                         switch( readState )
00389                         {
00390                                 case 0:         //Assume we are in header at the start of a new line.  
00391                                                         //Look for keywords starting in column 1
00392                                         if((ch == '\n')&&(buf[lineStart] != ' ')&&(buf[lineStart] != '  ')){  //not equal to space or tab
00393                                                 if(curSpec == NULL){
00394                                                         curSpec = new gnSourceSpec(this, m_spec->GetSpecListLength());
00395                                                         curFrag = new gnFragmentSpec();
00396                                                         curFrag->AddSpec(curSpec);
00397                                                         curSpec->SetSourceName(m_openString);
00398                                                         m_spec->AddSpec(curFrag);
00399                                                 }
00400                                                 if(lineStart != sectionStart){  //Add the previous header to our list
00401                                                         uint32 j = SEQ_HEADER_NAME_LENGTH-1;
00402                                                         for(; j > 0; j--)       
00403                                                                 if((buf[sectionStart+j] != ' ')&&(buf[sectionStart+j] != '      '))
00404                                                                         break;
00405                                                         string header_name = string(buf+sectionStart, j+1);
00406                                                         curHeader = new gnSourceHeader(this, header_name, sectionStart + streamPos, lineStart - sectionStart);
00407                                                         //if this is header info _before_ a locus statement then its a general file header.
00408                                                         if(strncmp(&buf[lineStart], "LOCUS", 5) == 0)
00409                                                                 m_spec->AddHeader(curHeader);
00410                                                         else    //otherwise its a fragment header.
00411                                                                 curFrag->AddHeader(curHeader);
00412                                                         sectionStart = lineStart;
00413                                                 }
00414                                                 
00415                                                 if(strncmp(&buf[lineStart], "FEATURES", 8) == 0){
00416                                                         sectionStart = i + 1;
00417                                                         readState = 1;  //read in features
00418                                                 }else if(strncmp(&buf[lineStart], "ORIGIN", 6) == 0){
00419                                                         curHeader = new gnSourceHeader(this, string("ORIGIN"), sectionStart + streamPos, i - sectionStart + 1);
00420                                                         curFrag->AddHeader(curHeader);
00421                                                         curContig = new gnFileContig();
00422                                                         curContig->SetName(curContigName);
00423                                                         curContigName = "";
00424                                                         readState = 13;  //read in base pairs
00425                                                 }else if(strncmp(&buf[lineStart], "LOCUS", 5) == 0){
00426                                                         if(strncmp(&buf[lineStart+SEQ_LOCUS_CIRCULAR_COLUMN-1], "circular", 8) == 0)
00427                                                                 curFrag->SetCircular(true);
00428                                                         uint32 j = SEQ_LOCUS_NAME_LENGTH;
00429                                                         for(; j >= 0; j--)      
00430                                                                 if((buf[lineStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')&&(buf[sectionStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' '))
00431                                                                         break;
00432                                                         curContigName = string(buf+lineStart+SEQ_LOCUS_NAME_COLUMN-1, j+1);
00433                                                         curFrag->SetName(curContigName);
00434                                                 }else if(strncmp(&buf[lineStart], "^^", 2) == 0){
00435                                                         //start the sequence.
00436                                                         if(curContig == NULL){
00437                                                                 curContig = new gnFileContig();
00438                                                                 curContig->SetName(curContigName);
00439                                                                 curContigName = "";
00440                                                         }
00441                                                         i--;
00442                                                         readState = 14;
00443                                                         break;
00444                                                 }
00445                                         }
00446                                         if(ch == '\n')
00447                                                 lineStart = i + 1;
00448                                         break;
00449                                 case 1: //look for feature tag in column six.  ignore whitespace before feature.
00450                                         if((ch == ' ')||(ch == '        ')){
00451                                                 break;
00452                                         }else if(ch == '\n'){
00453                                                 lineStart = i + 1;
00454                                                 sectionStart = i + 1;
00455                                                 break;
00456                                         }else if(sectionStart == i){ //there was no whitespace, we hit a TAG instead
00457                                                 i--;
00458                                                 readState = 0; //Deal with a Header TAG
00459                                                 sectionStart = i + 1;
00460                                                 break;
00461                                         }else if((i - lineStart == SEQ_SUBTAG_COLUMN)||((buf[lineStart]=='      ')&&(i==lineStart+1))){
00462                                                 sectionStart = i;
00463                                                 readState = 2;
00464                                         } //
00465                                 case 2:  //Get the feature name.  stop on whitespace
00466                                         if((ch == ' ')||(ch == '        ')){
00467                                                 string featureName(buf+sectionStart, i - sectionStart);
00468                                                 curFeature = new gnFeature(featureName);
00469                                                 curFrag->AddFeature(curFeature);
00470                                                 sectionStart = i + 1;
00471                                                 readState = 3;
00472                                         }
00473                                         break;
00474                                 case 3:   //Ignore whitespace before feature location
00475                                         if((ch == ' ')||(ch == '        ')){
00476                                                 break;
00477                                         }else if((ch == '\r')||(ch == '\n')){
00478                                                 lineStart = i+1;
00479                                                 break;
00480                                         }
00481                                         sectionStart = i;
00482                                         readState = 4;
00483                                 case 4:         //Read a location start.  stop on (<.:^ and whitespace
00484                                         if((ch == ' ')||(ch == '        ')||(ch == '(')||(ch == '.')||(ch=='^')||(ch==':')){
00485                                                 string starter(buf+sectionStart, i - sectionStart);
00486                                                 if(ch == '('){
00487                                                         if(starter == "complement")
00488                                                                 curFeature->SetLocationType(gnLocation::LT_Complement);
00489                                                         else if(starter == "order")
00490                                                                 curFeature->SetLocationType(gnLocation::LT_Order);
00491                                                         else if(starter == "group")
00492                                                                 curFeature->SetLocationType(gnLocation::LT_Group);
00493                                                         else if(starter == "one-of")
00494                                                                 curFeature->SetLocationType(gnLocation::LT_OneOf);
00495                                                         sectionStart = i + 1;   //ignore join since it is default.
00496                                                         break;
00497                                                 }else if(ch == ':'){
00498                                                         curLocContig = starter;
00499                                                         sectionStart = i + 1;
00500                                                         break;
00501                                                 }
00502                                                 curLocationStart = atoi(starter.c_str());
00503                                                 readState = 6;  //read in end base by default.
00504                                                 if(ch == '.'){
00505                                                         //go to special state to look for another one.
00506                                                         readState = 5;
00507                                                         sectionStart = i + 1;
00508                                                         break;
00509                                                 }else if(ch == '^'){
00510                                                         curBaseLocationType = gnLocation::LT_BetweenBases;
00511                                                 }else if((ch == ' ')||(ch == '  ')){
00512                                                         //no end location go to qualifier
00513                                                         gnLocation curLocation(curLocationStart, curLocationStart);
00514                                                         curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00515                                                         readState = 7;
00516                                                 }
00517                                                 sectionStart = i + 1;
00518 
00519                                         }else if(ch == '<'){
00520                                                 curStartLength = -1;
00521                                                 sectionStart = i + 1;
00522                                         }else if(ch == '>'){
00523                                                 curStartLength = 1;
00524                                                 sectionStart = i + 1;
00525                                         }
00526                                         break;
00527                                 case 5: //look for another period or location start.
00528                                         if(ch == '.'){
00529                                                 curBaseLocationType = gnLocation::LT_Standard;
00530                                                 readState = 6;
00531                                                 sectionStart = i + 1;
00532                                                 break;
00533                                         }
00534                                         curBaseLocationType = gnLocation::LT_OneOf;
00535                                 case 6: //see if there's a second location value.  stop on >, and whitespace
00536                                         if(ch == '>'){
00537                                                 curEndLength = 1;
00538                                                 sectionStart = i + 1;
00539                                         }else if(ch == '<'){
00540                                                 curEndLength = -1;
00541                                                 sectionStart = i + 1;
00542                                         }else if((ch == ' ')||(ch == '  ')||(ch == ',')){
00543                                                 //read end location
00544                                                 string ender(buf+sectionStart, i - sectionStart);
00545                                                 gnSeqI curLocationEnd = atoi(ender.c_str());
00546                                                 gnLocation curLocation(curLocationStart, curStartLength, curLocationEnd, curEndLength, curBaseLocationType);
00547                                                 curEndLength = 0;
00548                                                 curStartLength = 0;
00549                                                 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00550                                                 readState = ch == ',' ? 3 : 7;  //read another loc if we need to.
00551                                                 sectionStart = i+1;
00552                                         }
00553                                         break;
00554                                 case 7:  //skip to start of qualifier
00555                                         if((ch != ' ')&&(ch != '        ')&&(lineStart == i)){
00556                                                 sectionStart = i;       // Hit a tag.  go deal with it.
00557                                                 readState = 0;
00558                                                 i--;
00559                                         }else if((ch != ' ')&&(ch != '  ')&&((lineStart == i - SEQ_SUBTAG_COLUMN)||((buf[lineStart]=='  ')&&(i==lineStart+1)))){
00560                                                 sectionStart = i;       // Hit a feature.  go deal with it.
00561                                                 readState = 2;
00562                                                 i--;
00563                                         }else if(ch == ','){  //oops!  another location to read!
00564                                                 sectionStart = i+1;
00565                                                 readState = 3;
00566                                         }else if(ch == '/'){  //finally, a qualifier.
00567                                                 sectionStart = i+1;
00568                                                 readState = 8;
00569                                         }else if(ch == '\n')
00570                                                 lineStart = i + 1;
00571                                         break;
00572                                 case 8:         //get a qualifier, stop on =
00573                                         if(ch == '='){
00574                                                 curQualifierName = string(buf+sectionStart, i - sectionStart);
00575                                                 readState = 9;
00576                                                 sectionStart = i+1;
00577                                         }
00578                                         break;
00579                                 case 9:         //are we getting a string? look for " or [
00580                                         if(ch == '"'){
00581                                                 readState = 10;
00582                                                 sectionStart = i;
00583                                                 curQualifierStart = i + streamPos;
00584                                         }else if(ch == '['){
00585                                                 readState = 11;
00586                                                 sectionStart = i;
00587                                         }else if((ch == '\r')||(ch == '\n')){
00588                                                 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00589                                                 sectionStart = i+1;
00590                                                 readState = 7; //look for another qualifier
00591                                         }
00592                                         break;
00593                                 case 10:                //read until the end of the quotation. look out for escaped quotes
00594                                         if(ch == '"')
00595                                                 readState = 11;
00596                                         if(ch == '\n'){
00597                                                 lineStart = i + 1;
00598                                         }
00599                                         break;
00600                                 case 11:
00601                                         if(ch != '"'){
00602                                                 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, curQualifierStart, i - sectionStart));
00603                                                 sectionStart = i+1;
00604                                                 readState = 7;  //look for another qualifier.
00605                                                 if(ch == '\n')
00606                                                         lineStart = i + 1;
00607                                         }else
00608                                                 readState = 10;  //quote was escaped.  look for another.
00609                                         break;
00610                                 case 12:
00611                                         if(ch == ']'){
00612                                                 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00613                                                 sectionStart = i+1;
00614                                                 readState = 7;  //look for another qualifier.
00615                                         }
00616                                         break;
00617                                 case 13:        //start the sequence read.
00618                                         if(ch == '^')   //stupid blattlab file format.
00619                                                 readState = 14;
00620                                         else{
00621                                                 curContig->SetSectStart(gnContigSequence, i + 1 + streamPos);
00622                                                 readState = 16;
00623                                         }
00624                                         break;
00625                                 case 14:        //wait for newline before sequence starts.
00626                                         if(ch == '\n'){
00627                                                 curContig->SetRepeatSeqGap(true);
00628                                                 lineStart = i + 1;
00629                                                 sectionStart = i + 1;
00630                                                 curContig->SetSectStart(gnContigSequence, i + 1 + streamPos);
00631                                                 readState = 15;
00632                                         }
00633                                         break;
00634                                 case 15:
00635                                         if(m_pFilter->IsValid(ch))
00636                                                 seqLength++;
00637                                         else
00638                                                 curContig->SetRepeatSeqGap(false);
00639                                         break;
00640                                 case 16:
00641                                         if((ch == '/')&&(i==lineStart)){
00642                                                 readState = 17;
00643                                         }else if(m_pFilter->IsValid(ch)){
00644                                                 seqLength++;
00645                                                 lineSeqSize++;
00646                                                 if(gapstart >= 0){
00647                                                         curContig->SetRepeatGapSize(i - gapstart);
00648                                                         gapstart = -1;
00649                                                 }
00650                                         }else if(ch == '\n'){   //IMPLEMENT ME! Needs consistent gap size checking
00651                                                 if(sectionStart == lineStart){
00652                                                         curContig->SetRepeatSeqGap(true);
00653                                                         curContig->SetRepeatSeqSize(seqLength);
00654                                                         gapstart = i;
00655                                                         for(; gapstart >= lineStart; gapstart--)
00656                                                                 if(m_pFilter->IsValid(buf[gapstart]))
00657                                                                         break;
00658                                                         gapstart++;
00659                                                 }else if(lineSeqSize != curContig->GetRepeatSeqGapSize().first)
00660                                                         curContig->SetRepeatSeqGap(false);
00661                                                 lineSeqSize = 0;
00662                                                 lineStart = i + 1;
00663                                         }
00664                                         break;
00665                                 case 17:
00666                                         if((ch == '\n')&&(buf[lineStart+1] == '/')){
00667                                                 curContig->SetSectEnd(gnContigSequence, lineStart - 2 + streamPos);
00668                                                 curContig->SetSeqLength(seqLength);
00669                                                 m_contigList.push_back(curContig);
00670                                                 curContig = 0;
00671                                                 curSpec->SetLength(seqLength);
00672                                                 curSpec = 0;
00673                                                 seqLength = 0;
00674                                                 lineStart = i + 1;
00675                                                 sectionStart = i + 1;
00676                                                 readState = 0;
00677                                         }
00678                                         break;
00679                         }
00680                 }
00681                 streamPos += bufReadLen;
00682         }
00683         if(curContig != 0){
00684                 curContig->SetSectEnd(gnContigSequence, streamPos - 1);
00685                 curContig->SetSeqLength(seqLength);
00686                 m_contigList.push_back(curContig);
00687                 curSpec->SetLength(seqLength);
00688         }
00689         if(curSpec != NULL)
00690                 if((curFrag->GetFeatureListLength() == 0) && (curFrag->GetHeaderListLength() == 0)
00691                         &&(curSpec->GetLength() == 0)){
00692                         m_spec->RemoveSpec(m_spec->GetSpecListLength() - 1);
00693                         delete curFrag;
00694                 }
00695         m_ifstream.clear();
00696         delete[] buf;
00697         return true;
00698 }