-
Notifications
You must be signed in to change notification settings - Fork 397
Add option to return sequences from the Genbank file as a stream of sequences #870
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,6 +47,7 @@ | |
| import java.util.HashMap; | ||
| import java.util.LinkedHashMap; | ||
| import java.util.List; | ||
| import java.util.stream.Stream; | ||
|
|
||
| /** | ||
| * Use {@link GenbankReaderHelper} as an example of how to use this class where {@link GenbankReaderHelper} should be the | ||
|
|
@@ -153,35 +154,56 @@ public LinkedHashMap<String,S> process(final int max) throws IOException, Compou | |
| } | ||
|
|
||
| LinkedHashMap<String,S> sequences = new LinkedHashMap<>(); | ||
| @SuppressWarnings("unchecked") | ||
| int i=0; | ||
| while(true) { | ||
| if(max>0 && i>=max) break; | ||
| i++; | ||
| String seqString = genbankParser.getSequence(bufferedReader, 0); | ||
| //reached end of file? | ||
| if(seqString==null) break; | ||
| @SuppressWarnings("unchecked") | ||
| S sequence = (S) sequenceCreator.getSequence(seqString, 0); | ||
| genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence); | ||
|
|
||
| // add features to new sequence | ||
| genbankParser.getFeatures().values().stream() | ||
| .flatMap(List::stream) | ||
| .forEach(sequence::addFeature); | ||
|
|
||
| // add taxonomy ID to new sequence | ||
| ArrayList<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref"); | ||
| if (dbQualifier != null){ | ||
| DBReferenceInfo q = dbQualifier.get(0); | ||
| sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK)); | ||
| S sequence = getSequence(); | ||
| if(null == sequence) { | ||
| break; | ||
| } | ||
|
|
||
| sequences.put(sequence.getAccession().getID(), sequence); | ||
| } | ||
|
|
||
| return sequences; | ||
| } | ||
|
|
||
| public Stream<S> getSequencesAsStream() { | ||
| return Stream.generate(() -> { | ||
| try { | ||
| return getSequence(); | ||
| } catch (IOException | CompoundNotFoundException e) { | ||
| // TODO Auto-generated catch block | ||
| e.printStackTrace(); | ||
| return null; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd rather throw the exception forward and let the caller handle it. Returning null means all callers have to handle the null and that they lose the ability to provide a good error message. |
||
| } | ||
| }); | ||
| } | ||
|
|
||
| private S getSequence() throws IOException, CompoundNotFoundException { | ||
| String seqString = genbankParser.getSequence(bufferedReader, 0); | ||
| //reached end of file? | ||
| if(seqString==null) { | ||
| return null; | ||
| } | ||
| @SuppressWarnings("unchecked") | ||
| S sequence = (S) sequenceCreator.getSequence(seqString, 0); | ||
| genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence); | ||
|
|
||
| // add features to new sequence | ||
| genbankParser.getFeatures().values().stream() | ||
| .flatMap(List::stream) | ||
| .forEach(sequence::addFeature); | ||
|
|
||
| // add taxonomy ID to new sequence | ||
| ArrayList<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref"); | ||
| if (dbQualifier != null){ | ||
| DBReferenceInfo q = dbQualifier.get(0); | ||
| sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK)); | ||
| } | ||
|
|
||
| return sequence; | ||
| } | ||
|
|
||
| public void close() { | ||
| try { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,6 +50,7 @@ | |
| import java.util.LinkedHashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.stream.Stream; | ||
|
|
||
| import static org.hamcrest.CoreMatchers.is; | ||
| import static org.junit.Assert.assertEquals; | ||
|
|
@@ -163,6 +164,27 @@ public void testProcess() throws Exception { | |
| assertEquals(3, dnaSequence.getAccession().getVersion().intValue()); | ||
| assertTrue(genbankDNA.isClosed()); | ||
| } | ||
|
|
||
| @Test | ||
| public void testSequenceStream() { | ||
| CheckableInputStream inStream = new CheckableInputStream(this.getClass().getResourceAsStream("/two-dnaseqs.gb")); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did you also include this file in the test resources directory? It does not seem to be in this PR. |
||
|
|
||
| GenbankReader<DNASequence, NucleotideCompound> genbankDNA | ||
| = new GenbankReader<>( | ||
| inStream, | ||
| new GenericGenbankHeaderParser<>(), | ||
| new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()) | ||
| ); | ||
|
|
||
| Stream<DNASequence> seqStream = genbankDNA.getSequencesAsStream(); | ||
| assertEquals(seqStream.count(),2); | ||
|
|
||
| assertFalse(genbankDNA.isClosed()); | ||
| genbankDNA.close(); | ||
| assertTrue(genbankDNA.isClosed()); | ||
| assertTrue(inStream.isclosed()); | ||
|
|
||
| } | ||
|
|
||
| /** | ||
| * Test the process method with a number of sequences to be read at each call. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is what is hanging the build. Because the stream returned is null (the file does not exist, see comment below), sequence is always null and this while loop never finishes.
The while loop is not needed anymore; simply take an action if the sequence is null.