Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.ArrayList;
Expand All @@ -55,20 +57,28 @@ public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> {

private SequenceCreatorInterface<C> sequenceCreator;
private GenbankSequenceParser<S,C> genbankParser;
private InputStream inputStream;
private BufferedReader bufferedReader;
private boolean closed;
private final Logger logger = LoggerFactory.getLogger(this.getClass());

/**
 * Reports the state of this reader's {@code closed} flag.
 *
 * @return {@code true} if the flag is set, {@code false} otherwise
 */
public boolean isClosed() {
    return this.closed;
}

/**
* If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
* local file offsets for quick reads. An InputStream does not give you the name of the stream to access quickly via file seek. A seek in
* an InputStream is forced to read all the data so you don't gain anything.
* @param br
* @param is
* @param headerParser
* @param sequenceCreator
*/
public GenbankReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser, SequenceCreatorInterface<C> sequenceCreator) {
public GenbankReader(final InputStream is, final SequenceHeaderParserInterface<S,C> headerParser,
final SequenceCreatorInterface<C> sequenceCreator) {
this.sequenceCreator = sequenceCreator;
this.inputStream = is;
genbankParser = new GenbankSequenceParser<S,C>();
bufferedReader = new BufferedReader(new InputStreamReader(is));
genbankParser = new GenbankSequenceParser<>();
closed = false;
}

/**
Expand All @@ -85,14 +95,14 @@ public GenbankReader(InputStream is, SequenceHeaderParserInterface<S,C> headerPa
* method denies read access to the file.
*/
public GenbankReader(
File file,
SequenceHeaderParserInterface<S,C> headerParser,
SequenceCreatorInterface<C> sequenceCreator
final File file,
final SequenceHeaderParserInterface<S,C> headerParser,
final SequenceCreatorInterface<C> sequenceCreator
) throws FileNotFoundException {

inputStream = new FileInputStream(file);
this.bufferedReader = new BufferedReader(new FileReader(file));
this.sequenceCreator = sequenceCreator;
genbankParser = new GenbankSequenceParser<S,C>();
genbankParser = new GenbankSequenceParser<>();
}

/**
Expand All @@ -108,8 +118,7 @@ public GenbankReader(
* @throws CompoundNotFoundException
*/
public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException {
LinkedHashMap<String,S> sequences = process(-1);
return sequences;
return process(-1);
}

/**
Expand All @@ -122,7 +131,7 @@ public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundExc
* time before the first result is available.<br>
* <b>N.B.</b>
* <ul>
* <li>This method ca't be called after calling its NO-ARGUMENT twin.</li>
* <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
* <li>remember to close the underlying resource when you are done.</li>
* </ul>
* @see #process()
Expand All @@ -134,17 +143,17 @@ public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundExc
* @throws IOException
* @throws CompoundNotFoundException
*/
public LinkedHashMap<String,S> process(int max) throws IOException, CompoundNotFoundException {
LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
public LinkedHashMap<String,S> process(final int max) throws IOException, CompoundNotFoundException {
LinkedHashMap<String,S> sequences = new LinkedHashMap<>();
@SuppressWarnings("unchecked")
int i=0;
BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
while(true) {
if(max>0 && i>=max) break;
i++;
String seqString = genbankParser.getSequence(br, 0);
String seqString = genbankParser.getSequence(bufferedReader, 0);
//reached end of file?
if(seqString==null) break;
@SuppressWarnings("unchecked")
S sequence = (S) sequenceCreator.getSequence(seqString, 0);
genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);

Expand All @@ -165,32 +174,41 @@ public LinkedHashMap<String,S> process(int max) throws IOException, CompoundNotF

sequences.put(sequence.getAccession().getID(), sequence);
}
br.close();
close();

if (max < 0) {
close();
}

return sequences;
}

public void close() throws IOException {
inputStream.close();
/**
 * Closes the underlying buffered reader and records the outcome in the {@code closed} flag.
 * An {@link IOException} raised while closing is logged rather than propagated; in that
 * case the flag is set to {@code false}.
 */
public void close() {
    try {
        bufferedReader.close();
        this.closed = true;
    } catch (IOException e) {
        // Pass the exception itself as the trailing argument so the logger records the
        // stack trace and cause, not only the message text.
        logger.error("Couldn't close the reader. {}", e.getMessage(), e);
        this.closed = false;
    }
}

public static void main(String[] args) throws Exception {
String proteinFile = "src/test/resources/BondFeature.gb";
FileInputStream is = new FileInputStream(proteinFile);

GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<ProteinSequence, AminoAcidCompound>(is, new GenericGenbankHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
LinkedHashMap<String,ProteinSequence> proteinSequences = proteinReader.process();
System.out.println(proteinSequences);

String inputFile = "src/test/resources/NM_000266.gb";
is = new FileInputStream(inputFile);
GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<DNASequence, NucleotideCompound>(is, new GenericGenbankHeaderParser<DNASequence,NucleotideCompound>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
LinkedHashMap<String,DNASequence> dnaSequences = dnaReader.process();
System.out.println(dnaSequences);

String crazyFile = "src/test/resources/CraftedFeature.gb";
is = new FileInputStream(crazyFile);
GenbankReader<DNASequence, NucleotideCompound> crazyReader = new GenbankReader<DNASequence, NucleotideCompound>(is, new GenericGenbankHeaderParser<DNASequence,NucleotideCompound>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
GenbankReader<DNASequence, NucleotideCompound> crazyReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
LinkedHashMap<String,DNASequence> crazyAnnotatedSequences = crazyReader.process();

is.close();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@
*/
package org.biojava.nbio.core.sequence.io;

import static org.junit.Assert.assertNotNull;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
Expand All @@ -46,10 +46,13 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.junit.Assert.*;

/**
*
* @author Scooter Willis <willishf at gmail dot com>
* @author Jacek Grzebyta
* @author Philippe Soares
*/
public class GenbankReaderTest {

Expand Down Expand Up @@ -84,31 +87,83 @@ public void testProcess() throws Exception {
InputStream inStream = this.getClass().getResourceAsStream("/BondFeature.gb");
assertNotNull(inStream);

GenbankReader<ProteinSequence, AminoAcidCompound> GenbankProtein
= new GenbankReader<ProteinSequence, AminoAcidCompound>(
GenbankReader<ProteinSequence, AminoAcidCompound> genbankProtein
= new GenbankReader<>(
inStream,
new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
new GenericGenbankHeaderParser<>(),
new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())
);
@SuppressWarnings("unused")
LinkedHashMap<String, ProteinSequence> proteinSequences = GenbankProtein.process();
inStream.close();

LinkedHashMap<String, ProteinSequence> proteinSequences = genbankProtein.process();

assertNotNull(proteinSequences);
assertEquals(1, proteinSequences.size());

ProteinSequence proteinSequence = proteinSequences.get("NP_000257");
assertNotNull(proteinSequences.get("NP_000257"));
assertEquals("NP_000257", proteinSequence.getAccession().getID());
assertEquals("4557789", proteinSequence.getAccession().getIdentifier());
assertEquals("GENBANK", proteinSequence.getAccession().getDataSource().name());
assertEquals(1, proteinSequence.getAccession().getVersion().intValue());
assertTrue(genbankProtein.isClosed());

logger.info("process DNA");
inStream = this.getClass().getResourceAsStream("/NM_000266.gb");
assertNotNull(inStream);

GenbankReader<DNASequence, NucleotideCompound> GenbankDNA
= new GenbankReader<DNASequence, NucleotideCompound>(
GenbankReader<DNASequence, NucleotideCompound> genbankDNA
= new GenbankReader<>(
inStream,
new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(),
new GenericGenbankHeaderParser<>(),
new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())
);
@SuppressWarnings("unused")
LinkedHashMap<String, DNASequence> dnaSequences = GenbankDNA.process();
inStream.close();
LinkedHashMap<String, DNASequence> dnaSequences = genbankDNA.process();

assertNotNull(dnaSequences);
assertEquals(1, dnaSequences.size());

DNASequence dnaSequence = dnaSequences.get("NM_000266");
assertNotNull(dnaSequences.get("NM_000266"));
assertEquals("NM_000266", dnaSequence.getAccession().getID());
assertEquals("223671892", dnaSequence.getAccession().getIdentifier());
assertEquals("GENBANK", dnaSequence.getAccession().getDataSource().name());
assertEquals(3, dnaSequence.getAccession().getVersion().intValue());
assertTrue(genbankDNA.isClosed());
}

/**
 * Test the process method with a number of sequences to be read at each call.
 * The underlying {@link InputStream} should remain open until the reader is
 * explicitly closed.
 */
@Test
public void testPartialProcess() throws IOException, CompoundNotFoundException, NoSuchFieldException {
    InputStream inStream = this.getClass().getResourceAsStream("/two-dnaseqs.gb");
    // Fail right here if the resource is missing, as the other tests in this class do.
    assertNotNull(inStream);

    GenbankReader<DNASequence, NucleotideCompound> genbankDNA
            = new GenbankReader<>(
                    inStream,
                    new GenericGenbankHeaderParser<>(),
                    new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())
            );

    // First call to process(1) returns the first sequence
    LinkedHashMap<String, DNASequence> dnaSequences = genbankDNA.process(1);

    assertNotNull(dnaSequences);
    assertEquals(1, dnaSequences.size());
    assertNotNull(dnaSequences.get("vPetite"));
    // A partial read must leave the reader open so the next call can continue.
    assertFalse(genbankDNA.isClosed());

    // Second call to process(1) returns the second sequence
    dnaSequences = genbankDNA.process(1);
    assertNotNull(dnaSequences);
    assertEquals(1, dnaSequences.size());
    assertNotNull(dnaSequences.get("sbFDR"));

    // Still open after the last partial read; only an explicit close() flips the flag.
    assertFalse(genbankDNA.isClosed());
    genbankDNA.close();
    assertTrue(genbankDNA.isClosed());

}

@Test
public void CDStest() throws Exception {
Expand All @@ -118,9 +173,9 @@ public void CDStest() throws Exception {
assertNotNull(inStream);

GenbankReader<ProteinSequence, AminoAcidCompound> GenbankProtein
= new GenbankReader<ProteinSequence, AminoAcidCompound>(
= new GenbankReader<>(
inStream,
new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
new GenericGenbankHeaderParser<>(),
new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())
);
LinkedHashMap<String, ProteinSequence> proteinSequences = GenbankProtein.process();
Expand All @@ -130,7 +185,7 @@ public void CDStest() throws Exception {
Assert.assertTrue(proteinSequences.size() == 1);
logger.debug("protein sequences: {}", proteinSequences);

ProteinSequence protein = new ArrayList<ProteinSequence>(proteinSequences.values()).get(0);
ProteinSequence protein = new ArrayList<>(proteinSequences.values()).get(0);

FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound> cdsFeature = protein.getFeaturesByType("CDS").get(0);
String codedBy = cdsFeature.getQualifiers().get("coded_by").get(0).getValue();
Expand All @@ -139,8 +194,8 @@ public void CDStest() throws Exception {

Assert.assertNotNull(codedBy);
Assert.assertTrue(!codedBy.isEmpty());
Assert.assertEquals(codedBy, "NM_000266.2:503..904");
Assert.assertEquals(5, dbrefs.size());
assertEquals(codedBy, "NM_000266.2:503..904");
assertEquals(5, dbrefs.size());

}

Expand Down