@@ -106,6 +106,9 @@ public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Comp
106106 protected static final String BASE_COUNT_TAG = "BASE" ;
107107 // "CONTIG"
108108 protected static final String START_SEQUENCE_TAG = "ORIGIN" ;
109+ protected static final String DBSOURCE = "DBSOURCE" ;
110+ protected static final String PRIMARY = "PRIMARY" ;
111+ protected static final String DBLINK = "DBLINK" ;
109112 protected static final String END_SEQUENCE_TAG = "//" ;
110113 // locus line
111114 protected static final Pattern lp = Pattern .compile ("^(\\ S+)\\ s+(\\ d+)\\ s+(bp|BP|aa|AA)\\ s{0,4}(([dmsDMS][sS]-)?(\\ S+))?\\ s*(circular|CIRCULAR|linear|LINEAR)?\\ s*(\\ S+)?\\ s*(\\ S+)?$" );
@@ -130,9 +133,6 @@ public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Comp
130133
131134 protected static final Pattern readableFiles = Pattern .compile (".*(g[bp]k*$|\\ u002eg[bp].*)" );
132135 protected static final Pattern headerLine = Pattern .compile ("^LOCUS.*" );
133- private static final String DBSOURCE = "DBSOURCE" ;
134- private static final String PRIMARY = "PRIMARY" ;
135- private static final String DBLINK = "DBLINK" ;
136136
137137
138138 private String parse (BufferedReader bufferedReader ) {
@@ -151,175 +151,192 @@ private String parse(BufferedReader bufferedReader) {
151151 throw new ParserException (Messages .SECTIONKEYNULL );
152152 }
153153 // process section-by-section
154- if (sectionKey .equals (LOCUS_TAG )) {
155- String loc = section .get (0 )[1 ];
156- header = loc ;
157- Matcher m = lp .matcher (loc );
158- if (m .matches ()) {
159- headerParser .setName (m .group (1 ));
160- headerParser .setAccession (m .group (1 )); // default if no accession found
161- sequenceLength = Long .valueOf (m .group (2 ));
162- String lengthUnits = m .group (3 );
163- String type = m .group (6 );
164-
165- if (lengthUnits .equalsIgnoreCase ("aa" )) {
166- compoundType = AminoAcidCompoundSet .getAminoAcidCompoundSet ();
167- } else if (lengthUnits .equalsIgnoreCase ("bp" )) {
168- if (type != null ) {
169- if (type .contains ("RNA" )) {
170- compoundType = RNACompoundSet .getRNACompoundSet ();
171- } else {
172- compoundType = DNACompoundSet .getDNACompoundSet ();
173- }
174- } else {
175- compoundType = DNACompoundSet .getDNACompoundSet ();
176- }
154+ switch (sectionKey ) {
155+ case LOCUS_TAG : parseLocusTag (section ); break ;
156+ case DEFINITION_TAG : parseDefinitionTag (section ); break ;
157+ case ACCESSION_TAG : parseAccessionTag (section ); break ;
158+ case VERSION_TAG : parseVersionTag (section ); break ;
159+ case KEYWORDS_TAG : break ; // not implemented yet
160+ case SOURCE_TAG : break ; // ignore - can get all this from the first feature
161+ case REFERENCE_TAG : parseReferenceTag (section ); break ;
162+ case COMMENT_TAG : parseCommentTag (section ); break ;
163+ case FEATURE_TAG : parseFeatureTag (section ); break ;
164+ case BASE_COUNT_TAG : break ; // ignore - can calculate from sequence content later if needed
165+ case START_SEQUENCE_TAG : parseStartSequenceTag (section ); break ;
166+ case DBSOURCE : break ; // not implemented yet
167+ case PRIMARY : break ; // not implemented yet
168+ case DBLINK : break ; // not implemented yet
169+ default :
170+ if (!sectionKey .equals (END_SEQUENCE_TAG )) {
171+ log .info ("found unknown section key: " +sectionKey );
177172 }
173+ }
174+ } while (!sectionKey .equals (END_SEQUENCE_TAG ));
175+ return seqData ;
176+ }
178177
179- if (m .group (7 ) != null ) isCircularSequence = m .group (7 ).equalsIgnoreCase ("circular" );
180-
181- // configure location parser with needed information
182- locationParser .setSequenceLength (sequenceLength );
183- locationParser .setSequenceCircular (isCircularSequence );
184-
185- log .debug ("compound type: {}" , compoundType .getClass ().getSimpleName ());
178+ private void parseStartSequenceTag (List <String []> section ) {
179+ // our first line is ignorable as it is the ORIGIN tag
180+ // the second line onwards conveniently have the number as
181+ // the [0] tuple, and sequence string as [1] so all we have
182+ // to do is concat the [1] parts and then strip out spaces,
183+ // and replace '.' and '~' with '-' for our parser.
184+ StringBuffer seq = new StringBuffer ();
185+ for (int i = 1 ; i < section .size (); i ++) {
186+ seq .append (section .get (i )[1 ]);
187+ }
188+ seqData = seq .toString ().replaceAll ("\\ s+" , "" ).replaceAll ("[\\ .|~]" , "-" ).toUpperCase ();
189+ }
186190
187- } else {
188- throw new ParserException ("Bad locus line" );
191+ private void parseFeatureTag (List <String []> section ) {
192+ // starting from second line of input, start a new feature whenever we come across
193+ // a key that does not start with /
194+ AbstractFeature gbFeature = null ;
195+ for (int i = 1 ; i < section .size (); i ++) {
196+ String key = section .get (i )[0 ];
197+ String val = section .get (i )[1 ];
198+ if (key .startsWith ("/" )) {
199+ if (gbFeature == null ) {
200+ throw new ParserException ("Malformed GenBank file: found a qualifier without feature." );
189201 }
190- } else if (sectionKey .equals (DEFINITION_TAG )) {
191- headerParser .setDescription (section .get (0 )[1 ]);
192- } else if (sectionKey .equals (ACCESSION_TAG )) {
193- // if multiple accessions, store only first as accession,
194- // and store rest in annotation
195- String [] accs = section .get (0 )[1 ].split ("\\ s+" );
196- accession = accs [0 ].trim ();
197- headerParser .setAccession (accession );
198- } else if (sectionKey .equals (VERSION_TAG )) {
199- String ver = section .get (0 )[1 ];
200- Matcher m = vp .matcher (ver );
201- if (m .matches ()) {
202- String verAcc = m .group (1 );
203- if (!accession .equals (verAcc )) {
204- // the version refers to a different accession!
205- // believe the version line, and store the original
206- // accession away in the additional accession set
207- accession = verAcc ;
208- }
209- if (m .group (3 ) != null ) {
210- headerParser .setVersion (Integer .parseInt (m .group (3 )));
211- }
212- if (m .group (5 ) != null ) {
213- headerParser .setIdentifier (m .group (5 ));
214- }
215- } else {
216- throw new ParserException ("Bad version line" );
202+ key = key .substring (1 ); // strip leading slash
203+ val = val .replaceAll ("\\ s*[\\ n\\ r]+\\ s*" , " " ).trim ();
204+ if (val .endsWith ("\" " )) {
205+ val = val .substring (1 , val .length () - 1 ); // strip quotes
217206 }
218- } else if (sectionKey .equals (KEYWORDS_TAG )) {
219- } else if (sectionKey .equals (SOURCE_TAG )) {
220- // ignore - can get all this from the first feature
221- } else if (sectionKey .equals (REFERENCE_TAG )) {
222- GenbankReference genbankReference = new GenbankReference ();
223- for (String [] ref : section ) {
224- if (ref [0 ].equals (AUTHORS_TAG )) {
225- genbankReference .setAuthors (ref [1 ]);
226- } else if (ref [0 ].equals (TITLE_TAG )) {
227- genbankReference .setTitle (ref [1 ]);
228- } else if (ref [0 ].equals (JOURNAL_TAG )) {
229- genbankReference .setJournal (ref [1 ]);
207+ // parameter on old feature
208+ if (key .equals ("db_xref" )) {
209+ Matcher m = dbxp .matcher (val );
210+ if (m .matches ()) {
211+ String dbname = m .group (1 );
212+ String raccession = m .group (2 );
213+ DBReferenceInfo xref = new DBReferenceInfo (dbname , raccession );
214+ gbFeature .addQualifier (key , xref );
215+
216+ ArrayList <DBReferenceInfo > listDBEntry = new ArrayList <>();
217+ listDBEntry .add (xref );
218+ mapDB .put (key , listDBEntry );
219+ } else {
220+ throw new ParserException ("Bad dbxref" );
230221 }
231- }
232- headerParser .addReference (genbankReference );
233-
234- } else if (sectionKey .equals (COMMENT_TAG )) {
235- // Set up some comments
236- headerParser .setComment (section .get (0 )[1 ]);
237- } else if (sectionKey .equals (FEATURE_TAG )) {
238- // starting from second line of input, start a new feature whenever we come across
239- // a key that does not start with /
240- AbstractFeature gbFeature = null ;
241- for (int i = 1 ; i < section .size (); i ++) {
242- String key = section .get (i )[0 ];
243- String val = section .get (i )[1 ];
244- if (key .startsWith ("/" )) {
245- if (gbFeature == null ) {
246- throw new ParserException ("Malformed GenBank file: found a qualifier without feature." );
247- }
248- key = key .substring (1 ); // strip leading slash
249- val = val .replaceAll ("\\ s*[\\ n\\ r]+\\ s*" , " " ).trim ();
250- if (val .endsWith ("\" " )) {
251- val = val .substring (1 , val .length () - 1 ); // strip quotes
252- }
253- // parameter on old feature
254- if (key .equals ("db_xref" )) {
255- Matcher m = dbxp .matcher (val );
256- if (m .matches ()) {
257- String dbname = m .group (1 );
258- String raccession = m .group (2 );
259- DBReferenceInfo xref = new DBReferenceInfo (dbname , raccession );
260- gbFeature .addQualifier (key , xref );
261-
262- ArrayList <DBReferenceInfo > listDBEntry = new ArrayList <>();
263- listDBEntry .add (xref );
264- mapDB .put (key , listDBEntry );
265- } else {
266- throw new ParserException ("Bad dbxref" );
267- }
268- } else if (key .equalsIgnoreCase ("organism" )) {
269- Qualifier q = new Qualifier (key , val .replace ('\n' , ' ' ));
270- gbFeature .addQualifier (key , q );
271- } else {
272- if (key .equalsIgnoreCase ("translation" )) {
273- // strip spaces from sequence
274- val = val .replaceAll ("\\ s+" , "" );
275- Qualifier q = new Qualifier (key , val );
276- gbFeature .addQualifier (key , q );
277- } else {
278- Qualifier q = new Qualifier (key , val );
279- gbFeature .addQualifier (key , q );
280- }
281- }
222+ } else if (key .equalsIgnoreCase ("organism" )) {
223+ Qualifier q = new Qualifier (key , val .replace ('\n' , ' ' ));
224+ gbFeature .addQualifier (key , q );
225+ } else {
226+ if (key .equalsIgnoreCase ("translation" )) {
227+ // strip spaces from sequence
228+ val = val .replaceAll ("\\ s+" , "" );
229+ Qualifier q = new Qualifier (key , val );
230+ gbFeature .addQualifier (key , q );
282231 } else {
283- // new feature!
284- gbFeature = new TextFeature (key , val , key , key );
285- Location l =
286- locationParser .parse (val );
287- gbFeature .setLocation ((AbstractLocation )l );
288-
289- if (!featureCollection .containsKey (key )) {
290- featureCollection .put (key , new ArrayList <>());
291- }
292- featureCollection .get (key ).add (gbFeature );
232+ Qualifier q = new Qualifier (key , val );
233+ gbFeature .addQualifier (key , q );
293234 }
294235 }
295- } else if (sectionKey .equals (BASE_COUNT_TAG )) {
296- // ignore - can calculate from sequence content later if needed
297- } else if (sectionKey .equals (START_SEQUENCE_TAG )) {
298- // our first line is ignorable as it is the ORIGIN tag
299- // the second line onwards conveniently have the number as
300- // the [0] tuple, and sequence string as [1] so all we have
301- // to do is concat the [1] parts and then strip out spaces,
302- // and replace '.' and '~' with '-' for our parser.
303- StringBuffer seq = new StringBuffer ();
304- for (int i = 1 ; i < section .size (); i ++) {
305- seq .append (section .get (i )[1 ]);
306- }
307- seqData = seq .toString ().replaceAll ("\\ s+" , "" ).replaceAll ("[\\ .|~]" , "-" ).toUpperCase ();
308- } else if (sectionKey .equals (DBSOURCE )) {
309- //TODO
310- } else if (sectionKey .equals (PRIMARY )) {
311- //TODO
312- } else if (sectionKey .equals (DBLINK )) {
313- //TODO
314236 } else {
315- if (!sectionKey .equals (END_SEQUENCE_TAG )) {
316- log .info ("found unknown section key: " +sectionKey );
237+ // new feature!
238+ gbFeature = new TextFeature (key , val , key , key );
239+ Location l =
240+ locationParser .parse (val );
241+ gbFeature .setLocation ((AbstractLocation )l );
242+
243+ if (!featureCollection .containsKey (key )) {
244+ featureCollection .put (key , new ArrayList <>());
317245 }
246+ featureCollection .get (key ).add (gbFeature );
318247 }
319- } while (!sectionKey .equals (END_SEQUENCE_TAG ));
320- return seqData ;
248+ }
321249 }
322250
251+ private void parseCommentTag (List <String []> section ) {
252+ headerParser .setComment (section .get (0 )[1 ]);
253+ }
254+
255+ private void parseReferenceTag (List <String []> section ) {
256+ GenbankReference genbankReference = new GenbankReference ();
257+ for (String [] ref : section ) {
258+ if (ref [0 ].equals (AUTHORS_TAG )) {
259+ genbankReference .setAuthors (ref [1 ]);
260+ } else if (ref [0 ].equals (TITLE_TAG )) {
261+ genbankReference .setTitle (ref [1 ]);
262+ } else if (ref [0 ].equals (JOURNAL_TAG )) {
263+ genbankReference .setJournal (ref [1 ]);
264+ }
265+ }
266+ headerParser .addReference (genbankReference );
267+ }
268+
269+ private void parseVersionTag (List <String []> section ) {
270+ String ver = section .get (0 )[1 ];
271+ Matcher m = vp .matcher (ver );
272+ if (m .matches ()) {
273+ String verAcc = m .group (1 );
274+ if (!accession .equals (verAcc )) {
275+ // the version refers to a different accession!
276+ // believe the version line, and store the original
277+ // accession away in the additional accession set
278+ accession = verAcc ;
279+ }
280+ if (m .group (3 ) != null ) {
281+ headerParser .setVersion (Integer .parseInt (m .group (3 )));
282+ }
283+ if (m .group (5 ) != null ) {
284+ headerParser .setIdentifier (m .group (5 ));
285+ }
286+ } else {
287+ throw new ParserException ("Bad version line" );
288+ }
289+ }
290+
291+ private void parseAccessionTag (List <String []> section ) {
292+ // if multiple accessions, store only first as accession,
293+ // and store rest in annotation
294+ String [] accs = section .get (0 )[1 ].split ("\\ s+" );
295+ accession = accs [0 ].trim ();
296+ headerParser .setAccession (accession );
297+ }
298+
299+ private void parseDefinitionTag (List <String []> section ) {
300+ headerParser .setDescription (section .get (0 )[1 ]);
301+ }
302+
303+ private void parseLocusTag (List <String []> section ) {
304+ String loc = section .get (0 )[1 ];
305+ header = loc ;
306+ Matcher m = lp .matcher (loc );
307+ if (m .matches ()) {
308+ headerParser .setName (m .group (1 ));
309+ headerParser .setAccession (m .group (1 )); // default if no accession found
310+ sequenceLength = Long .valueOf (m .group (2 ));
311+ String lengthUnits = m .group (3 );
312+ String type = m .group (6 );
313+
314+ if (lengthUnits .equalsIgnoreCase ("aa" )) {
315+ compoundType = AminoAcidCompoundSet .getAminoAcidCompoundSet ();
316+ } else if (lengthUnits .equalsIgnoreCase ("bp" )) {
317+ if (type != null ) {
318+ if (type .contains ("RNA" )) {
319+ compoundType = RNACompoundSet .getRNACompoundSet ();
320+ } else {
321+ compoundType = DNACompoundSet .getDNACompoundSet ();
322+ }
323+ } else {
324+ compoundType = DNACompoundSet .getDNACompoundSet ();
325+ }
326+ }
327+
328+ if (m .group (7 ) != null ) isCircularSequence = m .group (7 ).equalsIgnoreCase ("circular" );
329+
330+ // configure location parser with needed information
331+ locationParser .setSequenceLength (sequenceLength );
332+ locationParser .setSequenceCircular (isCircularSequence );
333+
334+ log .debug ("compound type: {}" , compoundType .getClass ().getSimpleName ());
335+
336+ } else {
337+ throw new ParserException ("Bad locus line" );
338+ }
339+ }
323340
324341
325342 // reads an indented section, combining split lines and creating a list of
0 commit comments