Skip to content

Commit 0809c3f

Browse files
committed
reduced complexity of this monster
1 parent d02ed45 commit 0809c3f

File tree

1 file changed

+175
-158
lines changed

1 file changed

+175
-158
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankSequenceParser.java

Lines changed: 175 additions & 158 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Comp
106106
protected static final String BASE_COUNT_TAG = "BASE";
107107
// "CONTIG"
108108
protected static final String START_SEQUENCE_TAG = "ORIGIN";
109+
protected static final String DBSOURCE = "DBSOURCE";
110+
protected static final String PRIMARY = "PRIMARY";
111+
protected static final String DBLINK = "DBLINK";
109112
protected static final String END_SEQUENCE_TAG = "//";
110113
// locus line
111114
protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+(\\d+)\\s+(bp|BP|aa|AA)\\s{0,4}(([dmsDMS][sS]-)?(\\S+))?\\s*(circular|CIRCULAR|linear|LINEAR)?\\s*(\\S+)?\\s*(\\S+)?$");
@@ -130,9 +133,6 @@ public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Comp
130133

131134
protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
132135
protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");
133-
private static final String DBSOURCE = "DBSOURCE";
134-
private static final String PRIMARY = "PRIMARY";
135-
private static final String DBLINK = "DBLINK";
136136

137137

138138
private String parse(BufferedReader bufferedReader) {
@@ -151,175 +151,192 @@ private String parse(BufferedReader bufferedReader) {
151151
throw new ParserException(Messages.SECTIONKEYNULL);
152152
}
153153
// process section-by-section
154-
if (sectionKey.equals(LOCUS_TAG)) {
155-
String loc = section.get(0)[1];
156-
header = loc;
157-
Matcher m = lp.matcher(loc);
158-
if (m.matches()) {
159-
headerParser.setName(m.group(1));
160-
headerParser.setAccession(m.group(1)); // default if no accession found
161-
sequenceLength = Long.valueOf(m.group(2));
162-
String lengthUnits = m.group(3);
163-
String type = m.group(6);
164-
165-
if (lengthUnits.equalsIgnoreCase("aa")) {
166-
compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
167-
} else if (lengthUnits.equalsIgnoreCase("bp")) {
168-
if (type != null) {
169-
if (type.contains("RNA")) {
170-
compoundType = RNACompoundSet.getRNACompoundSet();
171-
} else {
172-
compoundType = DNACompoundSet.getDNACompoundSet();
173-
}
174-
} else {
175-
compoundType = DNACompoundSet.getDNACompoundSet();
176-
}
154+
switch (sectionKey) {
155+
case LOCUS_TAG: parseLocusTag(section); break;
156+
case DEFINITION_TAG: parseDefinitionTag(section); break;
157+
case ACCESSION_TAG: parseAccessionTag(section); break;
158+
case VERSION_TAG: parseVersionTag(section); break;
159+
case KEYWORDS_TAG: break; // not implemented yet
160+
case SOURCE_TAG: break; // ignore - can get all this from the first feature
161+
case REFERENCE_TAG: parseReferenceTag(section); break;
162+
case COMMENT_TAG: parseCommentTag(section); break;
163+
case FEATURE_TAG: parseFeatureTag(section); break;
164+
case BASE_COUNT_TAG: break; // ignore - can calculate from sequence content later if needed
165+
case START_SEQUENCE_TAG: parseStartSequenceTag(section); break;
166+
case DBSOURCE: break; // not implemented yet
167+
case PRIMARY: break; // not implemented yet
168+
case DBLINK: break; // not implemented yet
169+
default:
170+
if(!sectionKey.equals(END_SEQUENCE_TAG)) {
171+
log.info("found unknown section key: "+sectionKey);
177172
}
173+
}
174+
} while (!sectionKey.equals(END_SEQUENCE_TAG));
175+
return seqData;
176+
}
178177

179-
if (m.group(7) != null) isCircularSequence = m.group(7).equalsIgnoreCase("circular");
180-
181-
// configure location parser with needed information
182-
locationParser.setSequenceLength(sequenceLength);
183-
locationParser.setSequenceCircular(isCircularSequence);
184-
185-
log.debug("compound type: {}", compoundType.getClass().getSimpleName());
178+
private void parseStartSequenceTag(List<String[]> section) {
179+
// our first line is ignorable as it is the ORIGIN tag
180+
// the second line onwards conveniently have the number as
181+
// the [0] tuple, and sequence string as [1] so all we have
182+
// to do is concat the [1] parts and then strip out spaces,
183+
// and replace '.' and '~' with '-' for our parser.
184+
StringBuffer seq = new StringBuffer();
185+
for (int i = 1; i < section.size(); i++) {
186+
seq.append(section.get(i)[1]);
187+
}
188+
seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase();
189+
}
186190

187-
} else {
188-
throw new ParserException("Bad locus line");
191+
private void parseFeatureTag(List<String[]> section) {
192+
// starting from second line of input, start a new feature whenever we come across
193+
// a key that does not start with /
194+
AbstractFeature gbFeature = null;
195+
for (int i = 1; i < section.size(); i++) {
196+
String key = section.get(i)[0];
197+
String val = section.get(i)[1];
198+
if (key.startsWith("/")) {
199+
if (gbFeature == null) {
200+
throw new ParserException("Malformed GenBank file: found a qualifier without feature.");
189201
}
190-
} else if (sectionKey.equals(DEFINITION_TAG)) {
191-
headerParser.setDescription(section.get(0)[1]);
192-
} else if (sectionKey.equals(ACCESSION_TAG)) {
193-
// if multiple accessions, store only first as accession,
194-
// and store rest in annotation
195-
String[] accs = section.get(0)[1].split("\\s+");
196-
accession = accs[0].trim();
197-
headerParser.setAccession(accession);
198-
} else if (sectionKey.equals(VERSION_TAG)) {
199-
String ver = section.get(0)[1];
200-
Matcher m = vp.matcher(ver);
201-
if (m.matches()) {
202-
String verAcc = m.group(1);
203-
if (!accession.equals(verAcc)) {
204-
// the version refers to a different accession!
205-
// believe the version line, and store the original
206-
// accession away in the additional accession set
207-
accession = verAcc;
208-
}
209-
if (m.group(3) != null) {
210-
headerParser.setVersion(Integer.parseInt(m.group(3)));
211-
}
212-
if (m.group(5) != null) {
213-
headerParser.setIdentifier(m.group(5));
214-
}
215-
} else {
216-
throw new ParserException("Bad version line");
202+
key = key.substring(1); // strip leading slash
203+
val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim();
204+
if (val.endsWith("\"")) {
205+
val = val.substring(1, val.length() - 1); // strip quotes
217206
}
218-
} else if (sectionKey.equals(KEYWORDS_TAG)) {
219-
} else if (sectionKey.equals(SOURCE_TAG)) {
220-
// ignore - can get all this from the first feature
221-
} else if (sectionKey.equals(REFERENCE_TAG)) {
222-
GenbankReference genbankReference = new GenbankReference();
223-
for (String[] ref : section) {
224-
if (ref[0].equals(AUTHORS_TAG)) {
225-
genbankReference.setAuthors(ref[1]);
226-
} else if (ref[0].equals(TITLE_TAG)) {
227-
genbankReference.setTitle(ref[1]);
228-
} else if (ref[0].equals(JOURNAL_TAG)) {
229-
genbankReference.setJournal(ref[1]);
207+
// parameter on old feature
208+
if (key.equals("db_xref")) {
209+
Matcher m = dbxp.matcher(val);
210+
if (m.matches()) {
211+
String dbname = m.group(1);
212+
String raccession = m.group(2);
213+
DBReferenceInfo xref = new DBReferenceInfo(dbname, raccession);
214+
gbFeature.addQualifier(key, xref);
215+
216+
ArrayList<DBReferenceInfo> listDBEntry = new ArrayList<>();
217+
listDBEntry.add(xref);
218+
mapDB.put(key, listDBEntry);
219+
} else {
220+
throw new ParserException("Bad dbxref");
230221
}
231-
}
232-
headerParser.addReference(genbankReference);
233-
234-
} else if (sectionKey.equals(COMMENT_TAG)) {
235-
// Set up some comments
236-
headerParser.setComment(section.get(0)[1]);
237-
} else if (sectionKey.equals(FEATURE_TAG)) {
238-
// starting from second line of input, start a new feature whenever we come across
239-
// a key that does not start with /
240-
AbstractFeature gbFeature = null;
241-
for (int i = 1; i < section.size(); i++) {
242-
String key = section.get(i)[0];
243-
String val = section.get(i)[1];
244-
if (key.startsWith("/")) {
245-
if (gbFeature == null) {
246-
throw new ParserException("Malformed GenBank file: found a qualifier without feature.");
247-
}
248-
key = key.substring(1); // strip leading slash
249-
val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim();
250-
if (val.endsWith("\"")) {
251-
val = val.substring(1, val.length() - 1); // strip quotes
252-
}
253-
// parameter on old feature
254-
if (key.equals("db_xref")) {
255-
Matcher m = dbxp.matcher(val);
256-
if (m.matches()) {
257-
String dbname = m.group(1);
258-
String raccession = m.group(2);
259-
DBReferenceInfo xref = new DBReferenceInfo(dbname, raccession);
260-
gbFeature.addQualifier(key, xref);
261-
262-
ArrayList<DBReferenceInfo> listDBEntry = new ArrayList<>();
263-
listDBEntry.add(xref);
264-
mapDB.put(key, listDBEntry);
265-
} else {
266-
throw new ParserException("Bad dbxref");
267-
}
268-
} else if (key.equalsIgnoreCase("organism")) {
269-
Qualifier q = new Qualifier(key, val.replace('\n', ' '));
270-
gbFeature.addQualifier(key, q);
271-
} else {
272-
if (key.equalsIgnoreCase("translation")) {
273-
// strip spaces from sequence
274-
val = val.replaceAll("\\s+", "");
275-
Qualifier q = new Qualifier(key, val);
276-
gbFeature.addQualifier(key, q);
277-
} else {
278-
Qualifier q = new Qualifier(key, val);
279-
gbFeature.addQualifier(key, q);
280-
}
281-
}
222+
} else if (key.equalsIgnoreCase("organism")) {
223+
Qualifier q = new Qualifier(key, val.replace('\n', ' '));
224+
gbFeature.addQualifier(key, q);
225+
} else {
226+
if (key.equalsIgnoreCase("translation")) {
227+
// strip spaces from sequence
228+
val = val.replaceAll("\\s+", "");
229+
Qualifier q = new Qualifier(key, val);
230+
gbFeature.addQualifier(key, q);
282231
} else {
283-
// new feature!
284-
gbFeature = new TextFeature(key, val, key, key);
285-
Location l =
286-
locationParser.parse(val);
287-
gbFeature.setLocation((AbstractLocation)l);
288-
289-
if (!featureCollection.containsKey(key)) {
290-
featureCollection.put(key, new ArrayList<>());
291-
}
292-
featureCollection.get(key).add(gbFeature);
232+
Qualifier q = new Qualifier(key, val);
233+
gbFeature.addQualifier(key, q);
293234
}
294235
}
295-
} else if (sectionKey.equals(BASE_COUNT_TAG)) {
296-
// ignore - can calculate from sequence content later if needed
297-
} else if (sectionKey.equals(START_SEQUENCE_TAG)) {
298-
// our first line is ignorable as it is the ORIGIN tag
299-
// the second line onwards conveniently have the number as
300-
// the [0] tuple, and sequence string as [1] so all we have
301-
// to do is concat the [1] parts and then strip out spaces,
302-
// and replace '.' and '~' with '-' for our parser.
303-
StringBuffer seq = new StringBuffer();
304-
for (int i = 1; i < section.size(); i++) {
305-
seq.append(section.get(i)[1]);
306-
}
307-
seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase();
308-
} else if(sectionKey.equals(DBSOURCE)) {
309-
//TODO
310-
} else if(sectionKey.equals(PRIMARY)) {
311-
//TODO
312-
} else if(sectionKey.equals(DBLINK)) {
313-
//TODO
314236
} else {
315-
if(!sectionKey.equals(END_SEQUENCE_TAG)) {
316-
log.info("found unknown section key: "+sectionKey);
237+
// new feature!
238+
gbFeature = new TextFeature(key, val, key, key);
239+
Location l =
240+
locationParser.parse(val);
241+
gbFeature.setLocation((AbstractLocation)l);
242+
243+
if (!featureCollection.containsKey(key)) {
244+
featureCollection.put(key, new ArrayList<>());
317245
}
246+
featureCollection.get(key).add(gbFeature);
318247
}
319-
} while (!sectionKey.equals(END_SEQUENCE_TAG));
320-
return seqData;
248+
}
321249
}
322250

251+
private void parseCommentTag(List<String[]> section) {
252+
headerParser.setComment(section.get(0)[1]);
253+
}
254+
255+
private void parseReferenceTag(List<String[]> section) {
256+
GenbankReference genbankReference = new GenbankReference();
257+
for (String[] ref : section) {
258+
if (ref[0].equals(AUTHORS_TAG)) {
259+
genbankReference.setAuthors(ref[1]);
260+
} else if (ref[0].equals(TITLE_TAG)) {
261+
genbankReference.setTitle(ref[1]);
262+
} else if (ref[0].equals(JOURNAL_TAG)) {
263+
genbankReference.setJournal(ref[1]);
264+
}
265+
}
266+
headerParser.addReference(genbankReference);
267+
}
268+
269+
private void parseVersionTag(List<String[]> section) {
270+
String ver = section.get(0)[1];
271+
Matcher m = vp.matcher(ver);
272+
if (m.matches()) {
273+
String verAcc = m.group(1);
274+
if (!accession.equals(verAcc)) {
275+
// the version refers to a different accession!
276+
// believe the version line, and store the original
277+
// accession away in the additional accession set
278+
accession = verAcc;
279+
}
280+
if (m.group(3) != null) {
281+
headerParser.setVersion(Integer.parseInt(m.group(3)));
282+
}
283+
if (m.group(5) != null) {
284+
headerParser.setIdentifier(m.group(5));
285+
}
286+
} else {
287+
throw new ParserException("Bad version line");
288+
}
289+
}
290+
291+
private void parseAccessionTag(List<String[]> section) {
292+
// if multiple accessions, store only first as accession,
293+
// and store rest in annotation
294+
String[] accs = section.get(0)[1].split("\\s+");
295+
accession = accs[0].trim();
296+
headerParser.setAccession(accession);
297+
}
298+
299+
private void parseDefinitionTag(List<String[]> section) {
300+
headerParser.setDescription(section.get(0)[1]);
301+
}
302+
303+
private void parseLocusTag(List<String[]> section) {
304+
String loc = section.get(0)[1];
305+
header = loc;
306+
Matcher m = lp.matcher(loc);
307+
if (m.matches()) {
308+
headerParser.setName(m.group(1));
309+
headerParser.setAccession(m.group(1)); // default if no accession found
310+
sequenceLength = Long.valueOf(m.group(2));
311+
String lengthUnits = m.group(3);
312+
String type = m.group(6);
313+
314+
if (lengthUnits.equalsIgnoreCase("aa")) {
315+
compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
316+
} else if (lengthUnits.equalsIgnoreCase("bp")) {
317+
if (type != null) {
318+
if (type.contains("RNA")) {
319+
compoundType = RNACompoundSet.getRNACompoundSet();
320+
} else {
321+
compoundType = DNACompoundSet.getDNACompoundSet();
322+
}
323+
} else {
324+
compoundType = DNACompoundSet.getDNACompoundSet();
325+
}
326+
}
327+
328+
if (m.group(7) != null) isCircularSequence = m.group(7).equalsIgnoreCase("circular");
329+
330+
// configure location parser with needed information
331+
locationParser.setSequenceLength(sequenceLength);
332+
locationParser.setSequenceCircular(isCircularSequence);
333+
334+
log.debug("compound type: {}", compoundType.getClass().getSimpleName());
335+
336+
} else {
337+
throw new ParserException("Bad locus line");
338+
}
339+
}
323340

324341

325342
// reads an indented section, combining split lines and creating a list of

0 commit comments

Comments
 (0)