Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
BioJava Changelog
-----------------

BioJava 7.1.0 - future release
==============================
### Added
* Class `FastaStreamer` to read FASTA-formatted files using Java streams


BioJava 7.0.2
==============================
### Added
Expand Down
6 changes: 3 additions & 3 deletions biojava-aa-prop/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<parent>
<artifactId>biojava</artifactId>
<groupId>org.biojava</groupId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>biojava-aa-prop</artifactId>
Expand Down Expand Up @@ -70,12 +70,12 @@
<dependency>
<groupId>org.biojava</groupId>
<artifactId>biojava-core</artifactId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.biojava</groupId>
<artifactId>biojava-structure</artifactId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
</dependency>

<!-- logging dependencies (managed by parent pom, don't set versions or scopes here) -->
Expand Down
4 changes: 2 additions & 2 deletions biojava-alignment/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<parent>
<artifactId>biojava</artifactId>
<groupId>org.biojava</groupId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
</parent>
<artifactId>biojava-alignment</artifactId>
<name>biojava-alignment</name>
Expand Down Expand Up @@ -47,7 +47,7 @@
<dependency>
<groupId>org.biojava</groupId>
<artifactId>biojava-core</artifactId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
<dependency>
Expand Down
2 changes: 1 addition & 1 deletion biojava-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>biojava</artifactId>
<groupId>org.biojava</groupId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>biojava-core</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
import org.biojava.nbio.core.util.InputStreamProvider;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.file.Path;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.Consumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
* Read from a FASTA file (or gzipped FASTA file) and create a Java stream of {@link ProteinSequence} objects
* for use in a functional programming paradigm.
*
* @author Gary Murphy
* @since 7.1.0
*/
public class FastaStreamer {

// Location of the FASTA file (possibly gzipped) this streamer reads; fixed at construction.
private final Path path;
// Value passed to the reader's process(int) call each time stream() needs more sequences.
private int batchSize = 1_000;
// Custom header parser set via withHeaderParser(); while null, getHeaderParser() supplies a GenericFastaHeaderParser.
private SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser;
// Custom sequence creator set via withSequenceCreator(); while null, getSequenceCreator() supplies a ProteinSequenceCreator.
private SequenceCreatorInterface<AminoAcidCompound> sequenceCreator;
private LinkedHashMap<String, ProteinSequence> chunk = new LinkedHashMap<>();
private Iterator<Map.Entry<String, ProteinSequence>> iterator = Collections.emptyIterator();
private boolean closed = false;

/**
 * Private constructor; instances are obtained through the static {@code from(...)} factory methods.
 *
 * @param path the path to the file containing the FASTA content (possibly GZipped)
 */
private FastaStreamer(Path path) {
    this.path = path;
}

/**
 * Create a streamer over the FASTA content at the given path.
 *
 * @param path the path to the file containing the FASTA content (possibly GZipped); must not be null
 * @return a new streamer bound to {@code path}
 * @throws NullPointerException if {@code path} is null
 */
public static FastaStreamer from(final Path path) {
    // Fail fast with a named argument instead of an anonymous NPE later, when stream() dereferences the path.
    return new FastaStreamer(java.util.Objects.requireNonNull(path, "path"));
}

/**
 * Create a streamer over the FASTA content in the given file.
 *
 * @param file the file containing the FASTA content; converted with {@link File#toPath()}
 * @return a new streamer bound to the file's path
 */
public static FastaStreamer from(File file) {
    final Path location = file.toPath();
    return from(location);
}

/**
 * Use a custom header parser for the sequences read by this streamer.
 *
 * @param headerParser the parser to hand to the reader; {@code null} restores the default
 *                     chosen by {@link #getHeaderParser()}
 * @return this streamer, to allow call chaining
 */
public FastaStreamer withHeaderParser(final SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser) {
    this.headerParser = headerParser;
    return this;
}

/**
 * Use a custom sequence creator for the sequences read by this streamer.
 *
 * @param sequenceCreator the creator to hand to the reader; {@code null} restores the default
 *                        chosen by {@link #getSequenceCreator()}
 * @return this streamer, to allow call chaining
 */
public FastaStreamer withSequenceCreator(final SequenceCreatorInterface<AminoAcidCompound> sequenceCreator) {
    this.sequenceCreator = sequenceCreator;
    return this;
}

/**
 * Set the batch size, i.e. the value handed to the reader each time the stream needs to
 * fetch more sequences. The default is 1,000.
 *
 * @param size the batch size; not validated here
 * @return this streamer, to allow call chaining
 */
public FastaStreamer batchSize(final int size) {
    this.batchSize = size;
    return this;
}

/**
 * Expose the proteins in the file as an {@link Iterable}, so they can be visited with an
 * enhanced {@code for} loop:
 * <pre>
 * for (ProteinSequence sequence : FastaStreamer.from(path).each()) {
 *     // use the sequence
 * }
 * </pre>
 * A new stream is opened each time an iterator is requested from the returned iterable.
 *
 * @return an iterable suitable for an iteration loop
 */
public Iterable<ProteinSequence> each() {
    return () -> {
        final Stream<ProteinSequence> sequences = stream();
        return sequences.iterator();
    };
}

/**
* Create a stream of protein sequences from the contents of the path
* @return the stream
*/
public Stream<ProteinSequence> stream() {
InputStreamProvider provider = new InputStreamProvider();
InputStream input;
try {
input = provider.getInputStream(getPath().toFile());
} catch (IOException exception) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be better to throw the exception so that it can be handled at a higher level?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or if stream() doesn't permit a throws in the signature, then I'd advise using an UncheckedIOException instead of RuntimeException

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The exceptions would break stream chaining, as the contrived example below shows. I changed to UncheckedIOException. I originally had the exception being thrown, so it is allowed by Java, but it caused me problems when I used it in my code, so I get where you are coming from.

		String file = this.getClass().getResource("PF00104_small.fasta.gz").getFile();
		String file2 = this.getClass().getResource("PF00105_small.fasta.gz").getFile();
		List<Path> paths = List.of(Paths.get(file), Paths.get(file2));
		List<ProteinSequence> sequences = paths
				.stream()
				.flatMap(path -> FastaStreamer.from(path).stream()) // <-- Not allowed if exception
				.collect(Collectors.toList());

throw new UncheckedIOException(exception);
}
FastaReader<ProteinSequence, AminoAcidCompound> reader = new FastaReader<>(input, getHeaderParser(), getSequenceCreator());
// The size is unknown up front, which AbstractSpliterator expects to be reported as Long.MAX_VALUE.
// ORDERED is declared because sequences are handed out in the order the reader returns them.
Spliterator<ProteinSequence> source = new Spliterators.AbstractSpliterator<>(Long.MAX_VALUE, Spliterator.ORDERED | Spliterator.IMMUTABLE | Spliterator.NONNULL) {

    // Iteration state is deliberately held by this spliterator, not by the enclosing FastaStreamer,
    // so that every stream() call (and every pass over each()) starts from a clean slate.

    /** Entries of the most recently fetched batch that have not been handed out yet. */
    private Iterator<Map.Entry<String, ProteinSequence>> pending = Collections.emptyIterator();

    /** Set once the reader has reported the end of the input and has been closed. */
    private boolean exhausted = false;

    @Override
    public boolean tryAdvance(Consumer<? super ProteinSequence> action) {
        if (exhausted) {
            return false;
        }
        ProteinSequence protein = next(reader);
        if (null == protein) {
            return false;
        }
        action.accept(protein);
        return true;
    }

    /**
     * Fetch the next protein from the current batch. If the batch is used up, fetch another
     * batch from the source file; when the reader has nothing more to give, close it.
     *
     * NOTE(review): the reader is only closed when the input is read to the end. A stream that is
     * abandoned early (e.g. after limit()/findFirst()) leaves it open — consider Stream.onClose.
     *
     * @param reader
     *         the reader from which the FASTA content is read
     * @return the protein sequence, or <code>null</code> when the input is exhausted
     * @throws UncheckedIOException if reading from or closing the reader fails
     */
    private ProteinSequence next(FastaReader<ProteinSequence, AminoAcidCompound> reader) {
        try {
            if (!pending.hasNext()) {
                Map<String, ProteinSequence> batch = reader.process(getBatchSize());
                if (null == batch) {
                    exhausted = true;
                    reader.close();
                    return null;
                }
                pending = batch.entrySet().iterator();
            }
            if (pending.hasNext()) {
                Map.Entry<String, ProteinSequence> entry = pending.next();
                return createSequence(entry.getValue());
            }
            // A non-null but empty batch is treated as the end of the input as well.
            exhausted = true;
            reader.close();
        } catch (IOException exception) {
            throw new UncheckedIOException(String.format("I/O error reading the FASTA file from '%s'", getPath()), exception);
        }
        return null;
    }
}; // Spliterator
return StreamSupport.stream(source, false);
}

/**
 * Hook invoked for every sequence before it is emitted by the stream. This implementation returns
 * the sequence as-is; an overriding implementation can use it to build specific information
 * (for instance derived from the header) into the user collection space of the sequence.
 *
 * @param sequence the protein sequence read from the file
 * @return the sequence to emit
 */
protected ProteinSequence createSequence(final ProteinSequence sequence) {
    return sequence;
}

/**
 * @return the path of the FASTA file this streamer reads from
 */
protected Path getPath() {
    return this.path;
}

/**
 * @return the batch size currently configured (1,000 unless changed through {@code batchSize(int)})
 */
protected int getBatchSize() {
    return this.batchSize;
}

/**
 * Return the header parser to use when reading the file.
 *
 * @return the parser configured through {@code withHeaderParser(...)}, or a new
 *         {@link GenericFastaHeaderParser} when none has been configured
 */
protected SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> getHeaderParser() {
    // Only build the default when it is actually needed; Optional.orElse(new ...) built it on every call.
    if (headerParser != null) {
        return headerParser;
    }
    return new GenericFastaHeaderParser<>();
}

/**
 * Return the sequence creator to use when reading the file.
 *
 * @return the creator configured through {@code withSequenceCreator(...)}, or a new
 *         {@link ProteinSequenceCreator} over the amino acid compound set when none has been configured
 */
public SequenceCreatorInterface<AminoAcidCompound> getSequenceCreator() {
    // Only build the default when it is actually needed; Optional.orElse(new ...) built it on every call.
    if (sequenceCreator != null) {
        return sequenceCreator;
    }
    return new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.sequence.ProteinSequence;
import org.junit.Assert;
import org.junit.Test;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Test the functionality of the {@link FastaStreamer} code
 */
public class FastaStreamerTest {

    /** Gzipped FASTA fixture located next to this class on the test classpath. */
    private static final String FIXTURE = "PF00104_small.fasta.gz";

    /** Number of sequences the fixture is expected to contain. */
    private static final int FIXTURE_COUNT = 283;

    /**
     * Resolve a classpath resource to a file system path. The URL is converted through a URI
     * because {@code URL.getFile()} returns a percent-encoded string, which does not name the real
     * file when the checkout directory contains spaces or other escaped characters.
     *
     * @param name the resource name, relative to this class
     * @return the path of the resource
     */
    private Path resourcePath(String name) {
        try {
            return Paths.get(this.getClass().getResource(name).toURI());
        } catch (java.net.URISyntaxException exception) {
            throw new IllegalStateException("Cannot resolve test resource " + name, exception);
        }
    }

    @Test
    public void stream() throws IOException {
        Path path = resourcePath(FIXTURE);
        List<ProteinSequence> sequences;

        sequences = FastaStreamer.from(path).stream().collect(Collectors.toList());
        Assert.assertEquals("Count", FIXTURE_COUNT, sequences.size());

        ProteinSequence sequence;
        sequence = sequences.get(0);
        Assert.assertEquals("A2D504_ATEGE/1-46", sequence.getOriginalHeader());
        sequence = sequences.get(sequences.size() - 1);
        Assert.assertEquals("Q98SJ1_CHICK/15-61", sequence.getOriginalHeader());

        sequences = FastaStreamer.from(path)
                .batchSize(2) // Ensure there isn't an edge condition loading the next buffer
                .stream()
                .collect(Collectors.toList());
        Assert.assertEquals("Count", FIXTURE_COUNT, sequences.size());
    }

    @Test
    public void iterate() {
        Path path = resourcePath(FIXTURE);
        int count = 0;
        // Only the number of iterations matters here; the element itself is not inspected.
        for (ProteinSequence ignored : FastaStreamer.from(path).each()) {
            count++;
        }
        Assert.assertEquals("Count", FIXTURE_COUNT, count);
    }
}
Binary file not shown.
Binary file not shown.
6 changes: 3 additions & 3 deletions biojava-genome/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>biojava</artifactId>
<groupId>org.biojava</groupId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>biojava-genome</artifactId>
Expand Down Expand Up @@ -71,13 +71,13 @@
<dependency>
<groupId>org.biojava</groupId>
<artifactId>biojava-core</artifactId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.biojava</groupId>
<artifactId>biojava-alignment</artifactId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
<dependency>
Expand Down
4 changes: 2 additions & 2 deletions biojava-integrationtest/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<parent>
<artifactId>biojava</artifactId>
<groupId>org.biojava</groupId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
</parent>
<artifactId>biojava-integrationtest</artifactId>
<packaging>jar</packaging>
Expand Down Expand Up @@ -40,7 +40,7 @@
<dependency>
<groupId>org.biojava</groupId>
<artifactId>biojava-structure</artifactId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
</dependency>
<!-- logging dependencies (managed by parent pom, don't set versions or scopes here) -->
<dependency>
Expand Down
4 changes: 2 additions & 2 deletions biojava-modfinder/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<parent>
<artifactId>biojava</artifactId>
<groupId>org.biojava</groupId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
</parent>
<artifactId>biojava-modfinder</artifactId>
<name>biojava-modfinder</name>
Expand All @@ -31,7 +31,7 @@
<dependency>
<groupId>org.biojava</groupId>
<artifactId>biojava-structure</artifactId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
<type>jar</type>
<scope>compile</scope>
</dependency>
Expand Down
2 changes: 1 addition & 1 deletion biojava-ontology/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<parent>
<groupId>org.biojava</groupId>
<artifactId>biojava</artifactId>
<version>7.0.3-SNAPSHOT</version>
<version>7.1.0-SNAPSHOT</version>
</parent>

<artifactId>biojava-ontology</artifactId>
Expand Down
Loading