|
gnDNXSource.cppGo to the documentation of this file.00001 00002 // File: gnDNXSource.h 00003 // Purpose: Implements gnBaseSource for .DNX files 00004 // Description: 00005 // Changes: 00006 // Version: libGenome 0.1.0 00007 // Author: Aaron Darling 00008 // Last Edited: April 15, 2001, 11:13:00pm 00009 // Modified by: 00010 // Copyright: (c) Aaron Darling 00011 // Licenses: Proprietary 00013 #include "gn/gnDNXSource.h" 00014 #include "gn/gnSourceSpec.h" 00015 #include "gn/gnStringSpec.h" 00016 #include "gn/gnSourceFactory.h" 00017 #include "gn/gnFASSource.h" 00018 #include "gn/gnGBKSource.h" 00019 #include "gn/gnBaseHeader.h" 00020 #include "gn/gnFilter.h" 00021 #include "gn/gnDebug.h" 00022 #include "gn/gnStringTools.h" 00023 #include <string> 00024 00025 gnDNXSource::gnDNXSource() 00026 { 00027 m_DNXSpec = new gnGenomeSpec(); 00028 m_pFilter = gnFilter::fullDNASeqFilter(); 00029 if(m_pFilter == NULL){ 00030 DebugMsg("Error using static sequence filters."); 00031 } 00032 } 00033 00034 gnDNXSource::gnDNXSource( const gnDNXSource& s ) : gnFileSource(s) 00035 { 00036 if(s.m_DNXSpec != NULL) 00037 m_DNXSpec = s.m_DNXSpec->Clone(); 00038 } 00039 00040 gnDNXSource::~gnDNXSource() 00041 { 00042 m_ifstream.close(); 00043 delete m_DNXSpec; 00044 } 00045 boolean gnDNXSource::HasContig( const string& name ) const 00046 { 00047 for(uint32 contigI = 0; contigI < m_DNXSpec->GetSpecListLength(); contigI++){ 00048 if(m_DNXSpec->GetSpec(contigI)->GetName() == name) 00049 return true; 00050 } 00051 return false; 00052 } 00053 uint32 gnDNXSource::GetContigID( const string& name ) const 00054 { 00055 for(uint32 contigI = 0; contigI < m_DNXSpec->GetSpecListLength(); contigI++){ 00056 if(m_DNXSpec->GetSpec(contigI)->GetName() == name) 00057 return contigI; 00058 } 00059 return ALL_CONTIGS; 00060 } 00061 string gnDNXSource::GetContigName( const uint32 i ) const 00062 { 00063 if(i < m_DNXSpec->GetSpecListLength()){ 00064 gnBaseSpec *gnbs = m_DNXSpec->GetSpec(i); 00065 return gnbs->GetName(); 00066 } 00067 return ""; 00068 } 00069 gnSeqI gnDNXSource::GetContigSeqLength( const uint32 i ) const 00070 { 00071 if( i == ALL_CONTIGS){ 00072 return m_DNXSpec->GetLength(); 00073 }else if(i < m_DNXSpec->GetSpecListLength()){ 00074 gnBaseSpec *gnbs = m_DNXSpec->GetSpec(i); 00075 return gnbs->GetLength(); 00076 } 00077 return 0; 00078 } 00079 //read raw data from the file 00080 00081 void gnDNXSource::ValidateName(string& name){ 00082 if(name == ""){ //make a random one. 00083 name.resize(4); 00084 srand(time(NULL)); 00085 for(int i=0; i < 4; i++) 00086 name[i] = (rand() % 26) + 64; 00087 } 00088 } 00089 00090 boolean gnDNXSource::Write(gnGenomeSpec* spec, const string& filename){ 00091 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary); 00092 gnSourceFactory* m_sSourceFactory = gnSourceFactory::GetSourceFactory(); 00093 if(!m_ofstream.is_open()) 00094 return false; 00095 for(uint32 i=0; i < spec->GetSpecListLength(); i++){ //each of these will be dnx statements 00096 gnFragmentSpec* curStatementSpec = spec->GetSpec(i); 00097 string sourceName = spec->GetSourceName(); 00098 string statementName = spec->GetName(); 00099 if(!m_sSourceFactory->HasSource(sourceName)){ 00100 ValidateName(statementName); 00101 statementName += ".seq"; 00102 m_ofstream << statementName << "="; 00103 }else 00104 m_ofstream << sourceName << "="; 00105 for(uint32 j=0; j < curStatementSpec->GetSpecListLength(); j++){ //each of these will be the files 00106 //referred to by the dnx statement 00107 gnContigSpec* curSubSpec = curStatementSpec->GetSpec(i); 00108 sourceName = curStatementSpec->GetSourceName(); 00109 string contigName = curStatementSpec->GetName(); 00110 if(!m_sSourceFactory->HasSource(sourceName)){ 00111 ValidateName(contigName); 00112 string writename = contigName+".seq"; 00113 gnSequence gns = *curSubSpec; 00114 gnGBKSource::Write(gns, writename); 00115 m_ofstream << writename; 00116 }else 00117 m_ofstream << sourceName; 00118 if( j + 1 < curStatementSpec->GetSpecListLength()) 00119 m_ofstream << "+"; 00120 } 00121 m_ofstream << ";"; 00122 gnBaseHeader *gpbh = spec->GetHeader(0); 00123 string header = ""; 00124 if(gpbh != NULL){ 00125 header = gpbh->GetHeader(); 00126 //delete everything after the first newline. 00127 uint32 newlinepos = header.find_first_of('\n', 0); 00128 if(newlinepos != string::npos) 00129 header = header.substr(0, newlinepos - 1); 00130 } 00131 m_ofstream << header << "\r\n"; 00132 } 00133 m_ofstream.close(); 00134 return true; 00135 } 00136 00137 gnFileContig* gnDNXSource::GetFileContig( const uint32 contigI ) const{ 00138 return NULL; //returning NULL 00139 } 00140 00141 //reads an inputstream and creates fills the spec vector appropriately 00142 boolean gnDNXSource::ParseStream( istream& fin ) 00143 { 00144 // INIT temp varables 00145 uint32 readState = 0; //10 - currently inside a comment 00146 uint32 sectionStart = 0; 00147 gnFragmentSpec* currentFragSpec = 0; 00148 gnBaseSource *currentSource; 00149 string currentSourceName; 00150 uint32 currentContig = ALL_CONTIGS; 00151 uint32 currentSeqStart = 0; 00152 boolean currentRevComp = false; 00153 // INIT buffer 00154 uint64 bufReadLen = 0; 00155 uint64 remainingBuffer = 0; 00156 char* buf = new char[BUFFER_SIZE]; 00157 string curliteral; 00158 00159 //Get the source factory and add the current dnx path to it. 00160 gnSourceFactory *sourceFactory = gnSourceFactory::GetSourceFactory(); 00161 sourceFactory->AddPath(getPathString(m_openString)); 00162 00163 while( !fin.eof() ) 00164 { 00165 if(sectionStart > 0){ 00166 remainingBuffer = bufReadLen - sectionStart; 00167 if(readState == 5){ //add literal 00168 curliteral += string(buf, sectionStart, remainingBuffer); 00169 remainingBuffer = 0; 00170 sectionStart = bufReadLen; 00171 }else 00172 memmove(buf, buf+sectionStart, remainingBuffer); 00173 } 00174 // read chars 00175 fin.read( buf + remainingBuffer, BUFFER_SIZE - (bufReadLen - sectionStart)); 00176 sectionStart = 0; 00177 bufReadLen = fin.gcount() + remainingBuffer; 00178 00179 for( uint32 i=0 ; i < bufReadLen ; i++ ) 00180 { 00181 char ch = buf[i]; 00182 switch( readState ) 00183 { 00184 case 0: // Get name of genome 00185 if(ch == '='){ 00186 //genome name is from sectionStart to i 00187 string contigName(buf+sectionStart, i - sectionStart); 00188 currentFragSpec = new gnFragmentSpec(); 00189 currentFragSpec->SetName(contigName); 00190 currentFragSpec->SetSourceName(m_openString); 00191 m_DNXSpec->AddSpec(currentFragSpec); 00192 sectionStart = i+1; 00193 readState = 1; 00194 } 00195 break; 00196 case 1: // Ignore whitespace before filename or literal 00197 if((ch == ' ')||(ch == ' ')) 00198 break; 00199 case 2: // Are we getting a new source file name or a literal? 00200 if(ch == '"'){ //getting a literal 00201 readState = 5; 00202 sectionStart = i+1; 00203 break; 00204 } 00205 readState = 3; 00206 sectionStart = i; 00207 case 3: // Get a new source file name 00208 //stop on >, (, +, and \n 00209 if(ch == '\n' && sectionStart == i -1){ 00210 if(buf[sectionStart]=='\r'){ 00211 sectionStart = i + 1; 00212 break; 00213 } 00214 } 00215 if((ch == '+')||(ch == '>')||(ch == '(')||(ch == '\n')||(ch == ';')){ 00216 //use the entire source file 00217 string seqfile(buf, sectionStart, i - sectionStart); 00218 currentSourceName = seqfile; 00219 currentSource = sourceFactory->AddSource(seqfile, true); 00220 if (currentSource==NULL) 00221 { 00222 delete[] buf; 00223 return false; 00224 } 00225 if((ch == '+')||(ch == '\n')||(ch == ';')){ 00226 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource); 00227 tmp_spec->SetSourceName(seqfile); 00228 currentFragSpec->AddSpec(tmp_spec); 00229 readState = 1; 00230 if(ch == '\n'){ //reached the end of the statement. parse another. 00231 readState = 0; 00232 }else if(ch == ';'){ //hit a comment. 00233 readState = 9; 00234 } 00235 }else if(ch == '>'){ //select a contig to use 00236 readState = 4; 00237 }else if(ch == '('){ // use a specified section of the entire file 00238 readState = 6; 00239 } 00240 sectionStart = i + 1; 00241 } 00242 break; 00243 case 4: // Get a specific contig to use 00244 //stop on (, +, ;, and \n 00245 if((ch == '+')||(ch == '\n')||(ch == ';')||(ch == '(')){ 00246 //use the entire contig 00247 string contigname(buf, sectionStart, i - sectionStart); 00248 currentContig = currentSource->GetContigID(contigname); 00249 if((ch == '+')||(ch == '\n')||(ch == ';')){ 00250 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource, currentContig); 00251 tmp_spec->SetSourceName(currentSourceName); 00252 currentFragSpec->AddSpec(tmp_spec); 00253 readState = 1; 00254 if(ch == '\n'){ //reached the end of the statement. parse another. 00255 readState = 0; 00256 }else if(ch == ';'){ //hit a comment. 00257 readState = 9; 00258 } 00259 }else if(ch == '('){ //use the specified section 00260 readState = 6; 00261 } 00262 sectionStart = i + 1; 00263 } 00264 break; 00265 case 5: // read in a literal 00266 // stop on " 00267 if(ch == '"'){ 00268 //now create a string spec from sectionStart to i-1 00269 string literal(buf, sectionStart, i - sectionStart); 00270 if(curliteral.length() > 0){ 00271 literal += curliteral; 00272 curliteral = ""; 00273 } 00274 gnStringSpec *gpss = new gnStringSpec(literal, currentFragSpec->GetSpecListLength()); 00275 currentFragSpec->AddSpec(gpss); 00276 } 00277 case 6: // read in a specified section 00278 //stop on , or < or > 00279 if((ch == ',') || (ch == '<') || (ch == '>')){ 00280 string seqstartstring(buf, sectionStart, i - sectionStart); 00281 if(seqstartstring == "lend"){ 00282 currentSeqStart = 0; 00283 }else if (seqstartstring == "rend"){ 00284 currentSeqStart = GNSEQI_END; 00285 }else 00286 currentSeqStart = atoi(seqstartstring.c_str()) - 1; 00287 if(ch == '<') 00288 currentRevComp = true; 00289 sectionStart = i + 1; 00290 readState = 7; 00291 } 00292 break; 00293 case 7: // read in the second half of a specified section 00294 //stop on ) 00295 if(ch == ')'){ 00296 string seqendstring(buf, sectionStart, i - sectionStart); 00297 uint32 currentSeqEnd = GNSEQI_END; 00298 if(seqendstring == "lend"){ 00299 currentSeqEnd = 0; 00300 }else if (seqendstring == "rend"){ 00301 currentSeqEnd = GNSEQI_END; 00302 }else 00303 currentSeqEnd = atoi(seqendstring.c_str()) - 1; 00304 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource, currentContig, currentSeqStart, currentSeqEnd, currentRevComp); 00305 tmp_spec->SetSourceName(currentSourceName); 00306 currentFragSpec->AddSpec(tmp_spec); 00307 currentRevComp = false; //set it back to its default value. 00308 sectionStart = i + 1; 00309 readState = 8; //look for connective operator 00310 } 00311 break; 00312 case 8: //skip whitespace until a connective or terminating operator is reached. 00313 if(ch == '+'){ 00314 sectionStart = i + 1; 00315 readState = 1; //start over 00316 } 00317 if(ch == '\n'){ 00318 sectionStart = i + 1; 00319 readState = 0; 00320 } 00321 if(ch == ';'){ 00322 sectionStart = i + 1; 00323 readState = 9; 00324 } 00325 break; 00326 case 9: //skip comment until newline. 00327 if(ch == '\n'){ 00328 sectionStart = i + 1; 00329 readState = 0; 00330 } 00331 break; 00332 default: 00333 DebugMsg("ERROR in file\n"); 00334 return false; 00335 break; 00336 } 00337 }// for all buf 00338 }// while !eof 00339 // CLEAN UP 00340 delete[] buf; 00341 return true; 00342 } Generated at Fri Nov 30 15:36:51 2001 for libGenome by 1.2.8.1 written by Dimitri van Heesch, © 1997-2001 |