Skip to content

Commit 323cd9f

Browse files
authored
Merge pull request #829 from lee-mcfaul/improvement-800-genbankreader
Small improvements
2 parents f63ece3 + 4fd4149 commit 323cd9f

File tree

13 files changed

+1115
-78
lines changed

13 files changed

+1115
-78
lines changed

.gitattributes

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
*.sto text
3838
*.tsv text
3939
*.txt text
40-
*.xml text
40+
# eol=lf: the decompression test fails when org/biojava/nbio/core/util/build.xml has CRLF line endings
*.xml text eol=lf
4141
*.xsd text
4242
*.yml text
4343

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java

Lines changed: 28 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,8 @@
2626
package org.biojava.nbio.core.sequence.io;
2727

2828
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
29-
import org.biojava.nbio.core.sequence.DNASequence;
3029
import org.biojava.nbio.core.sequence.DataSource;
31-
import org.biojava.nbio.core.sequence.ProteinSequence;
3230
import org.biojava.nbio.core.sequence.TaxonomyID;
33-
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
34-
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
35-
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
36-
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
37-
import org.biojava.nbio.core.sequence.features.AbstractFeature;
3831
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
3932
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
4033
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
@@ -43,13 +36,20 @@
4336
import org.slf4j.Logger;
4437
import org.slf4j.LoggerFactory;
4538

46-
import java.io.*;
39+
import java.io.BufferedReader;
40+
import java.io.File;
41+
import java.io.FileNotFoundException;
42+
import java.io.FileReader;
43+
import java.io.IOException;
44+
import java.io.InputStream;
45+
import java.io.InputStreamReader;
4746
import java.util.ArrayList;
4847
import java.util.HashMap;
4948
import java.util.LinkedHashMap;
49+
import java.util.List;
5050

5151
/**
52-
* Use GenbankReaderHelper as an example of how to use this class where GenbankReaderHelper should be the
52+
* See {@link GenbankReaderHelper} for an example of how to use this class; {@link GenbankReaderHelper} should be the
5353
* primary class used to read Genbank files
5454
*
5555
*/
@@ -66,9 +66,9 @@ public boolean isClosed() {
6666
}
6767

6868
/**
69-
* If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
70-
* local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
71-
* an inputstream is forced to read all the data so you don't gain anything.
69+
* If you are going to use {@link FileProxyProteinSequenceCreator} then do not use this constructor because we need details about
70+
* local file offsets for quick reads. {@link InputStream} does not give you the name of the stream to access quickly via file seek. A seek in
71+
* an {@link InputStream} is forced to read all the data so you don't gain anything.
7272
* @param is
7373
* @param headerParser
7474
* @param sequenceCreator
@@ -107,18 +107,21 @@ public GenbankReader(
107107

108108
/**
109109
* The parsing is done in this method.<br>
110-
* This method tries to process all the available Genbank records
110+
* This method parses all the available Genbank records
111111
* in the File or InputStream, closes the underlying resource,
112112
* and returns the results in a {@link LinkedHashMap}.<br>
113-
* You don't need to call {@link #close()} after calling this method.
113+
* You don't need to call {@link GenbankReader#close()} after calling this method.
114114
* @see #process(int)
115115
* @return {@link LinkedHashMap} containing all the parsed Genbank records
116116
* present, from the current fileIndex onwards.
117117
* @throws IOException
118118
* @throws CompoundNotFoundException
119+
* @throws OutOfMemoryError if the input resource is larger than the allocated heap.
119120
*/
120121
public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException {
121-
return process(-1);
122+
LinkedHashMap<String,S> result = process(-1);
123+
close();
124+
return result;
122125
}
123126

124127
/**
@@ -137,13 +140,18 @@ public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundExc
137140
* @see #process()
138141
* @author Amr AL-Hossary
139142
* @since 3.0.6
140-
* @param max maximum number of records to return, <code>-1</code> for infinity.
143+
* @param max maximum number of records to return; {@link #process()} passes <code>-1</code> to read all remaining records.
141144
* @return {@link LinkedHashMap} containing at most <code>max</code> parsed Genbank records
142145
* present, from the current fileIndex onwards.
143146
* @throws IOException
144147
* @throws CompoundNotFoundException
145148
*/
146149
public LinkedHashMap<String,S> process(final int max) throws IOException, CompoundNotFoundException {
150+
151+
if(closed){
152+
throw new IOException("Cannot perform action: resource has been closed.");
153+
}
154+
147155
LinkedHashMap<String,S> sequences = new LinkedHashMap<>();
148156
@SuppressWarnings("unchecked")
149157
int i=0;
@@ -158,12 +166,9 @@ public LinkedHashMap<String,S> process(final int max) throws IOException, Compou
158166
genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);
159167

160168
// add features to new sequence
161-
for (String k: genbankParser.getFeatures().keySet()){
162-
for (AbstractFeature f: genbankParser.getFeatures(k)){
163-
//f.getLocations().setSequence(sequence); // can't set proper sequence source to features. It is actually needed? Don't think so...
164-
sequence.addFeature(f);
165-
}
166-
}
169+
genbankParser.getFeatures().values().stream()
170+
.flatMap(List::stream)
171+
.forEach(sequence::addFeature);
167172

168173
// add taxonomy ID to new sequence
169174
ArrayList<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref");
@@ -175,10 +180,6 @@ public LinkedHashMap<String,S> process(final int max) throws IOException, Compou
175180
sequences.put(sequence.getAccession().getID(), sequence);
176181
}
177182

178-
if (max < 0) {
179-
close();
180-
}
181-
182183
return sequences;
183184
}
184185

@@ -187,33 +188,9 @@ public void close() {
187188
bufferedReader.close();
188189
this.closed = true;
189190
} catch (IOException e) {
190-
logger.error("Couldn't close the reader. {}", e.getMessage());
191+
logger.error("Couldn't close the reader.", e);
191192
this.closed = false;
192193
}
193194
}
194-
195-
public static void main(String[] args) throws Exception {
196-
String proteinFile = "src/test/resources/BondFeature.gb";
197-
FileInputStream is = new FileInputStream(proteinFile);
198-
199-
GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
200-
LinkedHashMap<String,ProteinSequence> proteinSequences = proteinReader.process();
201-
System.out.println(proteinSequences);
202-
203-
String inputFile = "src/test/resources/NM_000266.gb";
204-
is = new FileInputStream(inputFile);
205-
GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
206-
LinkedHashMap<String,DNASequence> dnaSequences = dnaReader.process();
207-
System.out.println(dnaSequences);
208-
209-
String crazyFile = "src/test/resources/CraftedFeature.gb";
210-
is = new FileInputStream(crazyFile);
211-
GenbankReader<DNASequence, NucleotideCompound> crazyReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
212-
LinkedHashMap<String,DNASequence> crazyAnnotatedSequences = crazyReader.process();
213-
214-
is.close();
215-
System.out.println(crazyAnnotatedSequences);
216-
}
217-
218195
}
219196

biojava-core/src/main/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReader.java

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@
3232
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
3333
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
3434
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
35-
import org.biojava.nbio.core.sequence.features.*;
35+
import org.biojava.nbio.core.sequence.features.AbstractFeature;
36+
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
37+
import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface;
38+
import org.biojava.nbio.core.sequence.features.FeatureRetriever;
39+
import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface;
3640
import org.biojava.nbio.core.sequence.io.GenbankSequenceParser;
3741
import org.biojava.nbio.core.sequence.io.GenericGenbankHeaderParser;
3842
import org.biojava.nbio.core.sequence.template.AbstractSequence;
@@ -41,7 +45,14 @@
4145
import org.slf4j.Logger;
4246
import org.slf4j.LoggerFactory;
4347

44-
import java.io.*;
48+
import java.io.BufferedInputStream;
49+
import java.io.BufferedReader;
50+
import java.io.File;
51+
import java.io.FileInputStream;
52+
import java.io.FileOutputStream;
53+
import java.io.IOException;
54+
import java.io.InputStream;
55+
import java.io.InputStreamReader;
4556
import java.net.URL;
4657
import java.net.URLConnection;
4758
import java.util.ArrayList;
@@ -54,7 +65,7 @@
5465
*/
5566
public class GenbankProxySequenceReader<C extends Compound> extends StringProxySequenceReader<C> implements FeaturesKeyWordInterface, DatabaseReferenceInterface, FeatureRetriever {
5667

57-
private final static Logger logger = LoggerFactory.getLogger(GenbankProxySequenceReader.class);
68+
private static final Logger logger = LoggerFactory.getLogger(GenbankProxySequenceReader.class);
5869

5970
private static final String eutilBaseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; //
6071
private String genbankDirectoryCache = null;

biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankReaderTest.java

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,6 @@
2020
*/
2121
package org.biojava.nbio.core.sequence.io;
2222

23-
import java.io.IOException;
24-
import java.io.InputStream;
25-
import java.util.ArrayList;
26-
import java.util.LinkedHashMap;
27-
import java.util.List;
28-
import java.util.Map;
29-
3023
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
3124
import org.biojava.nbio.core.sequence.DNASequence;
3225
import org.biojava.nbio.core.sequence.ProteinSequence;
@@ -46,8 +39,20 @@
4639
import org.slf4j.Logger;
4740
import org.slf4j.LoggerFactory;
4841

42+
import java.io.BufferedInputStream;
43+
import java.io.IOException;
44+
import java.io.InputStream;
45+
import java.util.ArrayList;
46+
import java.util.LinkedHashMap;
47+
import java.util.List;
48+
import java.util.Map;
49+
4950
import static org.hamcrest.CoreMatchers.is;
50-
import static org.junit.Assert.*;
51+
import static org.junit.Assert.assertEquals;
52+
import static org.junit.Assert.assertFalse;
53+
import static org.junit.Assert.assertNotNull;
54+
import static org.junit.Assert.assertThat;
55+
import static org.junit.Assert.assertTrue;
5156

5257
/**
5358
*
@@ -161,7 +166,7 @@ public void testProcess() throws Exception {
161166
*/
162167
@Test
163168
public void testPartialProcess() throws IOException, CompoundNotFoundException, NoSuchFieldException {
164-
InputStream inStream = this.getClass().getResourceAsStream("/two-dnaseqs.gb");
169+
CheckableInputStream inStream = new CheckableInputStream(this.getClass().getResourceAsStream("/two-dnaseqs.gb"));
165170

166171
GenbankReader<DNASequence, NucleotideCompound> genbankDNA
167172
= new GenbankReader<>(
@@ -173,27 +178,29 @@ public void testPartialProcess() throws IOException, CompoundNotFoundException,
173178
// First call to process(1) returns the first sequence
174179
LinkedHashMap<String, DNASequence> dnaSequences = genbankDNA.process(1);
175180

181+
assertFalse(inStream.isclosed());
176182
assertNotNull(dnaSequences);
177183
assertEquals(1, dnaSequences.size());
178184
assertNotNull(dnaSequences.get("vPetite"));
179185

180186
// Second call to process(1) returns the second sequence
181187
dnaSequences = genbankDNA.process(1);
188+
assertFalse(inStream.isclosed());
182189
assertNotNull(dnaSequences);
183190
assertEquals(1, dnaSequences.size());
184191
assertNotNull(dnaSequences.get("sbFDR"));
185192

186193
assertFalse(genbankDNA.isClosed());
187194
genbankDNA.close();
188195
assertTrue(genbankDNA.isClosed());
189-
196+
assertTrue(inStream.isclosed());
190197
}
191198

192199
@Test
193200
public void CDStest() throws Exception {
194201
logger.info("CDS Test");
195202

196-
InputStream inStream = this.getClass().getResourceAsStream("/BondFeature.gb");
203+
CheckableInputStream inStream = new CheckableInputStream(this.getClass().getResourceAsStream("/BondFeature.gb"));
197204
assertNotNull(inStream);
198205

199206
GenbankReader<ProteinSequence, AminoAcidCompound> GenbankProtein
@@ -203,7 +210,7 @@ public void CDStest() throws Exception {
203210
new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())
204211
);
205212
LinkedHashMap<String, ProteinSequence> proteinSequences = GenbankProtein.process();
206-
inStream.close();
213+
assertTrue(inStream.isclosed());
207214

208215

209216
Assert.assertTrue(proteinSequences.size() == 1);
@@ -260,4 +267,27 @@ public void testNcbiExpandedAccessionFormats() throws Exception {
260267
DNASequence header2 = readGenbankResource("/empty_header2.gb");
261268
assertEquals("AZZZAA02123456789 10000000000 bp DNA linear PRI 15-OCT-2018", header2.getOriginalHeader());
262269
}
270+
271+
/**
272+
* Helper class to be able to verify the closed state of the input stream.
273+
*/
274+
private class CheckableInputStream extends BufferedInputStream {
275+
276+
private boolean closed;
277+
278+
CheckableInputStream(InputStream in) {
279+
super(in);
280+
closed = false;
281+
}
282+
283+
@Override
284+
public void close() throws IOException {
285+
super.close();
286+
closed = true;
287+
}
288+
289+
boolean isclosed() {
290+
return closed;
291+
}
292+
}
263293
}

0 commit comments

Comments
 (0)