Skip to content

Commit 66ca010

Browse files
authored
Merge pull request #872 from paolopavan/#855-make-or-read-genbank-file-error-when-feature-spans-zero-point-of-circular-sequence
#855 make or read genbank file error when feature spans zero point of circular sequence
2 parents 4b6d6d8 + d0f526b commit 66ca010

File tree

6 files changed

+380
-49
lines changed

6 files changed

+380
-49
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankSequenceParser.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Comp
6767
private GenericGenbankHeaderParser<S, C> headerParser;
6868
private String header;
6969
private String accession;
70+
private boolean isCircularSequence;
71+
private long sequenceLength;
7072
public LinkedHashMap<String, ArrayList<DBReferenceInfo>> mapDB;
7173
/**
7274
* this data structure collects list of features extracted from the
@@ -109,7 +111,7 @@ public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Comp
109111
protected static final String START_SEQUENCE_TAG = "ORIGIN";
110112
protected static final String END_SEQUENCE_TAG = "//";
111113
// locus line
112-
protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|BP|aa|AA)\\s{0,4}(([dmsDMS][sS]-)?(\\S+))?\\s*(circular|CIRCULAR|linear|LINEAR)?\\s*(\\S+)?\\s*(\\S+)?$");
114+
protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+(\\d+)\\s+(bp|BP|aa|AA)\\s{0,4}(([dmsDMS][sS]-)?(\\S+))?\\s*(circular|CIRCULAR|linear|LINEAR)?\\s*(\\S+)?\\s*(\\S+)?$");
113115
// version line
114116
protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
115117
// reference line
@@ -160,9 +162,9 @@ private String parse(BufferedReader bufferedReader) {
160162
if (m.matches()) {
161163
headerParser.setName(m.group(1));
162164
headerParser.setAccession(m.group(1)); // default if no accession found
163-
164-
String lengthUnits = m.group(2);
165-
String type = m.group(5);
165+
sequenceLength = Long.valueOf(m.group(2));
166+
String lengthUnits = m.group(3);
167+
String type = m.group(6);
166168

167169
if (lengthUnits.equalsIgnoreCase("aa")) {
168170
compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
@@ -178,6 +180,12 @@ private String parse(BufferedReader bufferedReader) {
178180
}
179181
}
180182

183+
if (m.group(7) != null) isCircularSequence = m.group(7).equalsIgnoreCase("circular");
184+
185+
// pass the sequence length and circular flag read from the locus line to the location parser
186+
locationParser.setSequenceLength(sequenceLength);
187+
locationParser.setSequenceCircular(isCircularSequence);
188+
181189
log.debug("compound type: {}", compoundType.getClass().getSimpleName());
182190

183191
} else {

biojava-core/src/main/java/org/biojava/nbio/core/sequence/location/InsdcLocations.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,11 @@ public GroupLocation(Location... subLocations) {
147147
this(Arrays.asList(subLocations));
148148
}
149149

150+
public GroupLocation(boolean isCircular, Location... subLocations) {
151+
this(Arrays.asList(subLocations));
152+
setCircular(isCircular);
153+
}
154+
150155
public GroupLocation(Point start, Point end, Strand strand,
151156
boolean circular, Location... subLocations) {
152157
super(start, end, strand, circular, subLocations);

biojava-core/src/main/java/org/biojava/nbio/core/sequence/location/InsdcParser.java

Lines changed: 54 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@
4949
*/
5050
public class InsdcParser <S extends AbstractSequence<C>, C extends Compound>{
5151

52+
private boolean isSequenceCircular;
53+
private long sequenceLength;
54+
5255
private final DataSource dataSource;
5356

5457
/**
@@ -80,7 +83,6 @@ public class InsdcParser <S extends AbstractSequence<C>, C extends Compound>{
8083
* Not really sure that they are not declared obsolete but they are still in
8184
* several files.
8285
*/
83-
//protected static final Pattern genbankSplitPattern = Pattern.compile("^\\s?(join|order|bond|complement|)\\(?([^\\)]+)\\)?");
8486
protected static final Pattern genbankSplitPattern = Pattern.compile("^\\s?(join|order|bond|complement|)\\(?(.+)\\)?");
8587
/**
8688
* designed to recursively split a location string in tokens. Valid tokens
@@ -126,7 +128,13 @@ public DataSource getDataSource() {
126128
return dataSource;
127129
}
128130

131+
public void setSequenceCircular(boolean sequenceCircular) {
132+
isSequenceCircular = sequenceCircular;
133+
}
129134

135+
public void setSequenceLength(long sequenceLength) {
136+
this.sequenceLength = sequenceLength;
137+
}
130138

131139
/**
132140
* Main method for parsing a location from a String instance
@@ -146,23 +154,24 @@ public Location parse(String locationString) throws ParserException {
146154
l = ll.get(0);
147155
} else {
148156
l = new SimpleLocation(
149-
featureGlobalStart,
150-
featureGlobalEnd,
157+
new SimplePoint(featureGlobalStart),
158+
new SimplePoint(featureGlobalEnd),
151159
Strand.UNDEFINED,
160+
isSequenceCircular,
152161
ll);
153162
}
154163
return l;
155164
}
156165

157-
/**
158-
* Reader based version of the parse methods.
159-
*
160-
* @param reader The source of the data; assumes that end of the reader
161-
* stream is the end of the location string to parse
162-
* @return The parsed location
163-
* @throws IOException Thrown with any reader error
164-
* @throws ParserException Thrown with any error with parsing locations
165-
*/
166+
/**
167+
* Reader based version of the parse methods.
168+
*
169+
* @param reader The source of the data; assumes that end of the reader
170+
* stream is the end of the location string to parse
171+
* @return The parsed location
172+
* @throws IOException Thrown with any reader error
173+
* @throws ParserException Thrown with any error with parsing locations
174+
*/
166175
public List<AbstractLocation> parse(Reader reader) throws IOException, ParserException {
167176
// not implemented: this method always returns null; use parse(String) instead
168177
return null;
@@ -186,7 +195,8 @@ private List<Location> parseLocationString(String string, int versus) throws Par
186195
if (!splitQualifier.isEmpty()) {
187196
//recursive case
188197
int localVersus = splitQualifier.equalsIgnoreCase("complement") ? -1 : 1;
189-
List<Location> subLocations = parseLocationString(splitString, versus * localVersus);
198+
List<Location> subLocations = parseLocationString(
199+
splitString, versus * localVersus);
190200

191201
switch (complexFeaturesAppendMode) {
192202
case FLATTEN:
@@ -228,8 +238,8 @@ private List<Location> parseLocationString(String string, int versus) throws Par
228238

229239
String accession = m.group(1);
230240
Strand s = versus == 1 ? Strand.POSITIVE : Strand.NEGATIVE;
231-
int start = Integer.parseInt(m.group(3));
232-
int end = m.group(6) == null ? start : new Integer(m.group(6));
241+
int start = Integer.valueOf(m.group(3));
242+
int end = m.group(6) == null ? start : Integer.valueOf(m.group(6));
233243

234244
if (featureGlobalStart > start) {
235245
featureGlobalStart = start;
@@ -238,11 +248,35 @@ private List<Location> parseLocationString(String string, int versus) throws Par
238248
featureGlobalEnd = end;
239249
}
240250

241-
AbstractLocation l = new SimpleLocation(
242-
start,
243-
end,
244-
s
245-
);
251+
AbstractLocation l;
252+
if (start <= end) {
253+
l = new SimpleLocation(
254+
start,
255+
end,
256+
s
257+
);
258+
} else {
259+
// when start > end the location spans the end point of the sequence; the Location contract requires such a location to be expressed as sublocations
260+
AbstractLocation l5prime = new SimpleLocation(
261+
1,
262+
end,
263+
Strand.UNDEFINED
264+
);
265+
AbstractLocation l3prime = new SimpleLocation(
266+
start,
267+
(int) sequenceLength,
268+
Strand.UNDEFINED
269+
);
270+
271+
l = new InsdcLocations.GroupLocation(
272+
new SimplePoint(start),
273+
new SimplePoint(end),
274+
s,
275+
isSequenceCircular,
276+
l5prime, l3prime
277+
);
278+
279+
}
246280

247281
if(m.group(4) != null && m.group(4).equals("^")) l.setBetweenCompounds(true);
248282

biojava-core/src/main/java/org/biojava/nbio/core/sequence/location/template/AbstractLocation.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,9 @@ protected void assertLocation() {
128128
int st = getStart().getPosition();
129129
int e = getEnd().getPosition();
130130

131-
if (st > e) {
131+
if (st > e && ! isCircular()) {
132132
throw new IllegalStateException(
133-
String.format("Start (%d) is greater than end (%d); "
133+
String.format("Start (%d) is greater than end (%d) in non circular sequence; "
134134
+ "this is an incorrect format",
135135
st, e));
136136
}

biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankReaderTest.java

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -22,41 +22,26 @@
2222

2323
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
2424
import org.biojava.nbio.core.sequence.DNASequence;
25-
import org.biojava.nbio.core.sequence.RNASequence;
2625
import org.biojava.nbio.core.sequence.ProteinSequence;
27-
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
28-
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
29-
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
30-
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
31-
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
26+
import org.biojava.nbio.core.sequence.RNASequence;
27+
import org.biojava.nbio.core.sequence.Strand;
28+
import org.biojava.nbio.core.sequence.compound.*;
3229
import org.biojava.nbio.core.sequence.features.FeatureInterface;
3330
import org.biojava.nbio.core.sequence.features.Qualifier;
31+
import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
3432
import org.biojava.nbio.core.sequence.template.AbstractSequence;
35-
import org.junit.After;
36-
import org.junit.AfterClass;
37-
import org.junit.Assert;
38-
import org.junit.Before;
39-
import org.junit.BeforeClass;
40-
import org.junit.Test;
33+
import org.junit.*;
4134
import org.slf4j.Logger;
4235
import org.slf4j.LoggerFactory;
4336

44-
import java.io.BufferedInputStream;
45-
import java.io.BufferedReader;
46-
import java.io.IOException;
47-
import java.io.InputStream;
48-
import java.io.InputStreamReader;
37+
import java.io.*;
4938
import java.util.ArrayList;
5039
import java.util.LinkedHashMap;
5140
import java.util.List;
5241
import java.util.Map;
5342

5443
import static org.hamcrest.CoreMatchers.is;
55-
import static org.junit.Assert.assertEquals;
56-
import static org.junit.Assert.assertFalse;
57-
import static org.junit.Assert.assertNotNull;
58-
import static org.junit.Assert.assertThat;
59-
import static org.junit.Assert.assertTrue;
44+
import static org.junit.Assert.*;
6045

6146
/**
6247
*
@@ -229,7 +214,7 @@ public void CDStest() throws Exception {
229214

230215
Assert.assertNotNull(codedBy);
231216
Assert.assertTrue(!codedBy.isEmpty());
232-
assertEquals(codedBy, "NM_000266.2:503..904");
217+
assertEquals("NM_000266.2:503..904", codedBy);
233218
assertEquals(5, dbrefs.size());
234219

235220
}
@@ -350,6 +335,22 @@ public void testLegacyLocusCompatable() throws IOException, CompoundNotFoundExce
350335

351336
}
352337

338+
@Test
339+
public void readSequenceWithZeroSpanFeature() throws IOException, CompoundNotFoundException {
340+
logger.info("make or read genbank file error when feature spans zero point of circular sequence (issue #855)");
341+
final DNASequence seq = readGenbankResource("/feature-spans-zero-point-circular-sequence.gb");
342+
343+
assertNotNull(seq);
344+
345+
final FeatureInterface<AbstractSequence<NucleotideCompound>, NucleotideCompound> f = seq.getFeatures().get(33);
346+
final AbstractLocation fLocation = f.getLocations();
347+
348+
assertEquals(true, fLocation.isCircular());
349+
assertEquals(7028, (int)fLocation.getStart().getPosition());
350+
assertEquals(286, (int)fLocation.getEnd().getPosition());
351+
assertEquals(Strand.NEGATIVE, fLocation.getStrand());
352+
}
353+
353354
/**
354355
* Helper class to be able to verify the closed state of the input stream.
355356
*/

0 commit comments

Comments
 (0)