Skip to content

Commit ab221dc

Browse files
jamesmorrisjamesmorris
authored andcommitted
Removed getLocusName()
1 parent 843c117 commit ab221dc

File tree

4 files changed

+216
-4
lines changed

4 files changed

+216
-4
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenericGenbankHeaderParser.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@ public class GenericGenbankHeaderParser<S extends AbstractSequence<C>, C extends
9797
@Override
9898
public void parseHeader(String header, S sequence) {
9999
sequence.setOriginalHeader(header);
100-
sequence.setLocusName(name);
101100
sequence.setAccession(new AccessionID(accession, DataSource.GENBANK, version, identifier));
102101
sequence.setDescription(description);
103102
sequence.setComments(comments);

biojava-core/src/main/java/org/biojava/nbio/core/sequence/location/InsdcParser.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,15 +66,15 @@ public class InsdcParser {
6666
* complement(location,location...location): consider locations in their
6767
* complement versus
6868
*
69-
* takes in input a comma splitted location string. The split must be done
69+
* Takes as input a comma-split location string. The split must be done
7070
* for outer level commas group(1) is the qualifier group(2) is the location
7171
* string to getFeatures. In case of complex splits it will contain the
7272
* nested expression
7373
*
7474
* Not really sure that they are not declared obsolete but they are still in
7575
* several files.
7676
*/
77-
protected static final Pattern genbankSplitPattern = Pattern.compile("^\\s?(join|order|bond|complement|)\\(?(.+)\\)?");
77+
protected static final Pattern genbankSplitPattern = Pattern.compile("^\\s?(join|order|bond|complement|)\\(?([\\s\\S]+)\\)?");
7878
/**
7979
* designed to recursively split a location string in tokens. Valid tokens
8080
* are those divided by commas that are not inside a bracket, i.e. split on

biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankWriterTest.java

Lines changed: 199 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,14 @@
2727
import org.biojava.nbio.core.sequence.AccessionID;
2828
import org.biojava.nbio.core.sequence.DNASequence;
2929
import org.biojava.nbio.core.sequence.features.AbstractFeature;
30+
import org.biojava.nbio.core.sequence.features.FeatureInterface;
3031
import org.biojava.nbio.core.sequence.features.Qualifier;
3132
import org.biojava.nbio.core.sequence.features.TextFeature;
3233
import org.biojava.nbio.core.sequence.location.SimpleLocation;
34+
import org.biojava.nbio.core.sequence.location.template.Location;
35+
import org.biojava.nbio.core.sequence.template.AbstractSequence;
3336
import org.biojava.nbio.core.sequence.Strand;
37+
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
3438
import org.junit.Assert;
3539
import org.junit.Test;
3640

@@ -110,7 +114,7 @@ public void testGithub886() throws Exception {
110114
Arrays.asList(seq),
111115
GenbankWriterHelper.LINEAR_DNA);
112116
fragwriter.close();
113-
System.out.println(fragwriter.toString().replaceAll("\r\n", "\n"));
117+
//System.out.println(fragwriter.toString().replaceAll("\r\n", "\n"));
114118

115119
// now read in the file that was created and check that the qualifiers were created correctly
116120
InputStream readerInputStream = new ByteArrayInputStream(fragwriter.toByteArray());
@@ -140,4 +144,198 @@ public void testGithub886() throws Exception {
140144
assertEquals("50%", newQualifiers.get("note7").get(0).getValue());
141145

142146
}
147+
148+
@Test
149+
public void testLocationJoins() throws Exception {
150+
151+
// First read a GenBank file containing location joins
152+
InputStream inStream = GenbankWriterTest.class.getResourceAsStream("/with_joins.gb");
153+
DNASequence sequence = GenbankReaderHelper.readGenbankDNASequence(inStream).values().iterator().next();
154+
155+
// Check the joins are read correctly
156+
List<FeatureInterface<AbstractSequence<NucleotideCompound>, NucleotideCompound>> features = sequence.getFeatures();
157+
158+
FeatureInterface<AbstractSequence<NucleotideCompound>, NucleotideCompound> join1 = features.get(0);
159+
List<Location> join1SubLocs = join1.getLocations().getSubLocations();
160+
161+
assertEquals("join1, getType()", "CDS", join1.getType());
162+
assertEquals("join1, getLocations().getStrand()", "POSITIVE", join1.getLocations().getStrand().toString());
163+
assertEquals("join1, getLocations().getSubLocations().size()", 8, join1SubLocs.size());
164+
165+
assertEquals("join1, SubLocation 1)", 3356, join1SubLocs.get(0).getStart().getPosition().intValue());
166+
assertEquals("join1, SubLocation 1)", 3356, join1SubLocs.get(0).getEnd().getPosition().intValue());
167+
168+
assertEquals("join1, SubLocation 2)", 3500, join1SubLocs.get(1).getStart().getPosition().intValue());
169+
assertEquals("join1, SubLocation 2)", 3792, join1SubLocs.get(1).getEnd().getPosition().intValue());
170+
171+
assertEquals("join1, SubLocation 3)", 3793, join1SubLocs.get(2).getStart().getPosition().intValue());
172+
assertEquals("join1, SubLocation 3)", 3793, join1SubLocs.get(2).getEnd().getPosition().intValue());
173+
174+
assertEquals("join1, SubLocation 4)", 4185, join1SubLocs.get(3).getStart().getPosition().intValue());
175+
assertEquals("join1, SubLocation 4)", 4228, join1SubLocs.get(3).getEnd().getPosition().intValue());
176+
177+
assertEquals("join1, SubLocation 5)", 4229, join1SubLocs.get(4).getStart().getPosition().intValue());
178+
assertEquals("join1, SubLocation 5)", 4229, join1SubLocs.get(4).getEnd().getPosition().intValue());
179+
180+
assertEquals("join1, SubLocation 6)", 4348, join1SubLocs.get(5).getStart().getPosition().intValue());
181+
assertEquals("join1, SubLocation 6)", 4676, join1SubLocs.get(5).getEnd().getPosition().intValue());
182+
183+
assertEquals("join1, SubLocation 7)", 4677, join1SubLocs.get(6).getStart().getPosition().intValue());
184+
assertEquals("join1, SubLocation 7)", 4677, join1SubLocs.get(6).getEnd().getPosition().intValue());
185+
186+
assertEquals("join1, SubLocation 8)", 4775, join1SubLocs.get(7).getStart().getPosition().intValue());
187+
assertEquals("join1, SubLocation 8)", 5094, join1SubLocs.get(7).getEnd().getPosition().intValue());
188+
189+
//qualifiers
190+
assertEquals("join1, getType()", "Joined feature", join1.getQualifiers().get("standard_name").get(0).getValue());
191+
192+
//Join 2
193+
FeatureInterface<AbstractSequence<NucleotideCompound>, NucleotideCompound> join2 = features.get(1);
194+
List<Location> join2SubLocs = join1.getLocations().getSubLocations();
195+
196+
assertEquals("join1, getType()", "CDS", join2.getType());
197+
assertEquals("join1, getLocations().getStrand()", "NEGATIVE", join2.getLocations().getStrand().toString());
198+
assertEquals("join1, getLocations().getSubLocations().size()", 8, join2SubLocs.size());
199+
200+
assertEquals("join2, SubLocation 1)", 3356, join2SubLocs.get(0).getStart().getPosition().intValue());
201+
assertEquals("join2, SubLocation 1)", 3356, join2SubLocs.get(0).getEnd().getPosition().intValue());
202+
203+
assertEquals("join2, SubLocation 2)", 3500, join2SubLocs.get(1).getStart().getPosition().intValue());
204+
assertEquals("join2, SubLocation 2)", 3792, join2SubLocs.get(1).getEnd().getPosition().intValue());
205+
206+
assertEquals("join2, SubLocation 3)", 3793, join2SubLocs.get(2).getStart().getPosition().intValue());
207+
assertEquals("join2, SubLocation 3)", 3793, join2SubLocs.get(2).getEnd().getPosition().intValue());
208+
209+
assertEquals("join2, SubLocation 4)", 4185, join2SubLocs.get(3).getStart().getPosition().intValue());
210+
assertEquals("join2, SubLocation 4)", 4228, join2SubLocs.get(3).getEnd().getPosition().intValue());
211+
212+
assertEquals("join2, SubLocation 5)", 4229, join2SubLocs.get(4).getStart().getPosition().intValue());
213+
assertEquals("join2, SubLocation 5)", 4229, join2SubLocs.get(4).getEnd().getPosition().intValue());
214+
215+
assertEquals("join2, SubLocation 6)", 4348, join2SubLocs.get(5).getStart().getPosition().intValue());
216+
assertEquals("join2, SubLocation 6)", 4676, join2SubLocs.get(5).getEnd().getPosition().intValue());
217+
218+
assertEquals("join2, SubLocation 7)", 4677, join2SubLocs.get(6).getStart().getPosition().intValue());
219+
assertEquals("join2, SubLocation 7)", 4677, join2SubLocs.get(6).getEnd().getPosition().intValue());
220+
221+
assertEquals("join2, SubLocation 8)", 4775, join2SubLocs.get(7).getStart().getPosition().intValue());
222+
assertEquals("join2, SubLocation 8)", 5094, join2SubLocs.get(7).getEnd().getPosition().intValue());
223+
224+
//qualifiers
225+
assertEquals("join1, getType()", "Joined feature on complement", join2.getQualifiers().get("standard_name").get(0).getValue());
226+
227+
// Now write the joins back to a file using the GenbankWriterHelper
228+
ByteArrayOutputStream fragwriter = new ByteArrayOutputStream();
229+
GenbankWriterHelper.writeNucleotideSequenceOriginal(
230+
fragwriter,
231+
Arrays.asList(sequence));
232+
fragwriter.close();
233+
234+
System.out.println(fragwriter.toString().replaceAll("\r\n", "\n"));
235+
236+
// Read the output file and test that no information is lost
237+
InputStream readerInputStream = new ByteArrayInputStream(fragwriter.toByteArray());
238+
DNASequence newSequence = GenbankReaderHelper.readGenbankDNASequence(readerInputStream).values().iterator().next();
239+
240+
List<FeatureInterface<AbstractSequence<NucleotideCompound>, NucleotideCompound>> newFeatures = newSequence.getFeatures();
241+
242+
// Check the output matches the original sequence feature
243+
for (int i=0; i < features.size(); i++ ) {
244+
assertEquals("getFeatures(), getType()", features.get(i).getType(), newFeatures.get(i).getType());
245+
assertEquals("getFeatures(), getLocations()", features.get(i).getLocations(), newFeatures.get(i).getLocations());
246+
assertEquals("getFeatures(), getStrand()", features.get(i).getLocations().getStrand(), newFeatures.get(i).getLocations().getStrand());
247+
248+
List<Location> subLocations = features.get(i).getLocations().getSubLocations();
249+
List<Location> newSubLocations = newFeatures.get(i).getLocations().getSubLocations();
250+
assertEquals("getSubLocations()", subLocations.size(), newSubLocations.size());
251+
252+
assertEquals("getSubLocations()", subLocations, newSubLocations);
253+
254+
for (int j=0; j < subLocations.size(); j++ ) {
255+
assertEquals("getSubLocations()", subLocations.get(j).toString(), newSubLocations.get(j).toString());
256+
}
257+
258+
Map<String, List<Qualifier>> qualifiers = features.get(i).getQualifiers();
259+
Map<String, List<Qualifier>> newQualifiers = newFeatures.get(i).getQualifiers();
260+
261+
for (String qualifierType: qualifiers.keySet()) {
262+
assertEquals("getSubLocations()", qualifiers.get(qualifierType).get(0).getValue(), newQualifiers.get(qualifierType).get(0).getValue());
263+
}
264+
265+
}
266+
267+
}
268+
269+
/**
270+
* Going from GenBank file -> DNASequence object -> GenBank file looses information
271+
* https://github.com/biojava/biojava/issues/942
272+
*/
273+
@Test
274+
public void testGithub942() throws Exception {
275+
276+
// Important information is lost when reading and writing a
277+
// GenBank file through GenbankReaderHelper & GenbankWriterHelper
278+
279+
// First read the sample GenBank file from
280+
// https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html using the
281+
// GenbankReaderHelper
282+
InputStream inStream = GenbankWriterTest.class.getResourceAsStream("/NC_000913.gb");
283+
DNASequence sequence = GenbankReaderHelper.readGenbankDNASequence(inStream).values().iterator().next();
284+
285+
// Then write sequence back to a file using the GenbankWriterHelper
286+
ByteArrayOutputStream fragwriter = new ByteArrayOutputStream();
287+
GenbankWriterHelper.writeNucleotideSequenceOriginal(
288+
fragwriter,
289+
Arrays.asList(sequence));
290+
fragwriter.close();
291+
292+
// Test no important information is lost
293+
InputStream readerInputStream = new ByteArrayInputStream(fragwriter.toByteArray());
294+
DNASequence newSequence = GenbankReaderHelper.readGenbankDNASequence(readerInputStream).values().iterator().next();
295+
296+
//System.out.println(fragwriter.toString().replaceAll("\r\n", "\n"));
297+
298+
assertEquals("getOriginalHeader()", sequence.getOriginalHeader(), newSequence.getOriginalHeader());
299+
assertEquals("getLength()", sequence.getLength(), newSequence.getLength());
300+
assertEquals("getAccession().getID()", sequence.getAccession().getID(), newSequence.getAccession().getID());
301+
assertEquals("getAccession().getVersion()", sequence.getAccession().getVersion(), newSequence.getAccession().getVersion());
302+
assertEquals("getDescription()", sequence.getDescription(), newSequence.getDescription());
303+
assertEquals("getSource()", sequence.getSource(), newSequence.getSource());
304+
assertEquals("getDNAType()", sequence.getDNAType(), newSequence.getDNAType());
305+
assertEquals("getTaxonomy()", sequence.getTaxonomy(), newSequence.getTaxonomy());
306+
assertEquals("getReferences()", sequence.getReferences(), newSequence.getReferences());
307+
assertEquals("getComments()", sequence.getComments(), newSequence.getComments());
308+
assertEquals("getNotesList()", sequence.getNotesList(), newSequence.getNotesList());
309+
310+
List<FeatureInterface<AbstractSequence<NucleotideCompound>, NucleotideCompound>> features = sequence.getFeatures();
311+
List<FeatureInterface<AbstractSequence<NucleotideCompound>, NucleotideCompound>> newFeatures = newSequence.getFeatures();
312+
313+
//feature locations and qualifiers
314+
for (int i=0; i < features.size(); i++ ) {
315+
assertEquals("getFeatures(), getType()", features.get(i).getType(), newFeatures.get(i).getType());
316+
assertEquals("getFeatures(), getLocations()", features.get(i).getLocations(), newFeatures.get(i).getLocations());
317+
assertEquals("getFeatures(), getStrand()", features.get(i).getLocations().getStrand(), newFeatures.get(i).getLocations().getStrand());
318+
319+
List<Location> subLocations = features.get(i).getLocations().getSubLocations();
320+
List<Location> newSubLocations = newFeatures.get(i).getLocations().getSubLocations();
321+
assertEquals("getSubLocations()", subLocations.size(), newSubLocations.size());
322+
323+
assertEquals("getSubLocations()", subLocations, newSubLocations);
324+
325+
for (int j=0; j < subLocations.size(); j++ ) {
326+
assertEquals("getSubLocations()", subLocations.get(j).toString(), newSubLocations.get(j).toString());
327+
}
328+
329+
Map<String, List<Qualifier>> qualifiers = features.get(i).getQualifiers();
330+
Map<String, List<Qualifier>> newQualifiers = newFeatures.get(i).getQualifiers();
331+
332+
for (String qualifierType: qualifiers.keySet()) {
333+
assertEquals("getSubLocations()", qualifiers.get(qualifierType).get(0).getValue(), newQualifiers.get(qualifierType).get(0).getValue());
334+
}
335+
336+
}
337+
338+
assertEquals("getSequenceAsString()", sequence.getSequenceAsString(), newSequence.getSequenceAsString());
339+
340+
}
143341
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
LOCUS 4 bp DNA circular SYN 26-SEP-2022
2+
DEFINITION .
3+
ACCESSION .
4+
VERSION .
5+
KEYWORDS .
6+
SOURCE .
7+
FEATURES Location/Qualifiers
8+
CDS join(3356,3500..3792,3793,4185..4228,4229,4348..4676,4677,
9+
4775..5094)
10+
/standard_name="Joined feature"
11+
CDS complement(join(3356,3500..3792,3793,4185..4228,4229,4348..4676,4677,4775..5094))
12+
/standard_name="Joined feature on complement"
13+
ORIGIN
14+
1 acgg
15+
//

0 commit comments

Comments
 (0)