Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@

import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.RNASequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;

import java.io.File;
import java.io.FileInputStream;
Expand Down Expand Up @@ -68,6 +70,34 @@ public static LinkedHashMap<String, DNASequence> readFastaDNASequence(File file,

}

/**
 * Reads RNA sequences from a FASTA file, optionally deferring the loading of the sequence data.
 * <p>
 * With {@code lazySequenceLoad=false} this delegates to {@link #readFastaRNASequence(File)}.
 * With {@code lazySequenceLoad=true} the file is parsed to determine the accession ids and offsets,
 * and the returned sequence objects are backed by a {@link FileProxyRNASequenceCreator} so that the
 * sequence can be read from disk later. This allows loading large FASTA files when only one
 * sequence, selected by accession id, is of interest.
 *
 * @param file the FASTA file to read
 * @param lazySequenceLoad {@code true} to create file-backed proxy sequences, {@code false} to read everything eagerly
 * @return the sequences produced by the reader's {@code process()} call
 * @throws IOException if the underlying reader reports an I/O failure
 */
public static LinkedHashMap<String, RNASequence> readFastaRNASequence(File file, boolean lazySequenceLoad) throws IOException {
    if (lazySequenceLoad) {
        // Collaborators are created in the same order as the original nested constructor arguments.
        GenericFastaHeaderParser<RNASequence, NucleotideCompound> headerParser =
                new GenericFastaHeaderParser<RNASequence, NucleotideCompound>();
        FileProxyRNASequenceCreator proxyCreator = new FileProxyRNASequenceCreator(
                file,
                RNACompoundSet.getRNACompoundSet(),
                new FastaSequenceParser());
        FastaReader<RNASequence, NucleotideCompound> lazyReader =
                new FastaReader<RNASequence, NucleotideCompound>(file, headerParser, proxyCreator);
        return lazyReader.process();
    }

    // Eager path: hand over to the plain File overload.
    return readFastaRNASequence(file);
}

/**
* Read a fasta file containing amino acids with setup that would handle most
* cases.
Expand Down Expand Up @@ -130,6 +160,35 @@ public static LinkedHashMap<String, DNASequence> readFastaDNASequence(
return dnaSequences;
}

/**
 * Reads RNA sequences in FASTA format from an input stream.
 * <p>
 * The stream is parsed with a {@code GenericFastaHeaderParser} and an {@code RNASequenceCreator}
 * configured with the RNA compound set. This method does not close {@code inStream} itself.
 *
 * @param inStream the stream supplying the FASTA data
 * @return the sequences produced by the reader's {@code process()} call
 * @throws IOException if the underlying reader reports an I/O failure
 */
public static LinkedHashMap<String, RNASequence> readFastaRNASequence(
        InputStream inStream) throws IOException {
    GenericFastaHeaderParser<RNASequence, NucleotideCompound> headerParser =
            new GenericFastaHeaderParser<RNASequence, NucleotideCompound>();
    RNASequenceCreator rnaCreator = new RNASequenceCreator(RNACompoundSet.getRNACompoundSet());
    return new FastaReader<RNASequence, NucleotideCompound>(inStream, headerParser, rnaCreator).process();
}

/**
 * Reads RNA sequences from a FASTA file.
 * <p>
 * Opens the file, delegates to {@link #readFastaRNASequence(InputStream)} and closes the file
 * stream again, whether or not parsing succeeds.
 *
 * @param file the FASTA file to read
 * @return the sequences returned by {@link #readFastaRNASequence(InputStream)}
 * @throws IOException if the file cannot be opened, or if reading or closing the stream fails
 */
public static LinkedHashMap<String, RNASequence> readFastaRNASequence(
        File file) throws IOException {
    FileInputStream inStream = new FileInputStream(file);
    try {
        return readFastaRNASequence(inStream);
    } finally {
        // Close in finally so the file handle is released even when parsing throws.
        inStream.close();
    }
}

public static void main(String[] args) throws Exception {

LinkedHashMap<String, DNASequence> dnaSequences = FastaReaderHelper.readFastaDNASequence(new File("fasta.fna"));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
* Created on 01-21-2010
*/
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.RNASequence;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
import org.biojava.nbio.core.sequence.loader.SequenceFileProxyLoader;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.biojava.nbio.core.sequence.template.ProxySequenceReader;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * This class is a good example of using the SequenceCreatorInterface where during parsing of the stream
 * the sequence and the offset index are passed to create an RNA sequence that will be loaded in lazily.
 * This way you can load very large fasta files and store accession id and delay loading the sequence to save
 * memory. The index is the file stream offset so when an RNASequence has a call to getSequence() the
 * SequenceFileProxyLoader will open the file and offset to the index and retrieve the sequence.
 *
 * Same approach can be used for genome sequence data stored in a local fasta file, in a database or via http
 * interface to a remote server
 *
 * @author Scooter Willis &lt;willishf at gmail dot com&gt;
 */
public class FileProxyRNASequenceCreator implements
        SequenceCreatorInterface<NucleotideCompound> {

    CompoundSet<NucleotideCompound> compoundSet = null;
    File file = null;
    SequenceParserInterface sequenceParser;

    /**
     * Need File so that we can store full path name in SequenceFileProxyLoader for Random File access as a quick read
     *
     * @param file the file that the created proxy loaders will be given
     * @param compoundSet the compound set handed to each proxy loader and to each created RNASequence
     * @param sequenceParser the parser handed to each proxy loader
     */
    public FileProxyRNASequenceCreator(File file,
            CompoundSet<NucleotideCompound> compoundSet,
            SequenceParserInterface sequenceParser) {
        this.compoundSet = compoundSet;
        this.file = file;
        this.sequenceParser = sequenceParser;
    }

    /**
     * Even though we are passing in the sequence we really only care about the length of the sequence and the offset
     * index in the fasta file.
     *
     * @param sequence the parsed sequence text; only its length is used here
     * @param index the offset index passed on to the SequenceFileProxyLoader
     * @return a new RNASequence backed by a SequenceFileProxyLoader
     * @throws CompoundNotFoundException declared by this method; propagated from the calls it makes
     * @throws IOException declared by this method; propagated from the calls it makes
     */
    @Override
    public AbstractSequence<NucleotideCompound> getSequence(String sequence, long index ) throws CompoundNotFoundException, IOException {
        SequenceFileProxyLoader<NucleotideCompound> sequenceFileProxyLoader = new SequenceFileProxyLoader<NucleotideCompound>(
                file,
                sequenceParser,
                index,
                sequence.length(),
                compoundSet);
        return new RNASequence(sequenceFileProxyLoader, compoundSet);
    }

    /**
     * Should be able to extend the same concept to a remote URL call or database connection. Not supported yet
     *
     * @param proxyLoader unused
     * @param index unused
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public AbstractSequence<NucleotideCompound> getSequence(
            ProxySequenceReader<NucleotideCompound> proxyLoader, long index) {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    /**
     * Not sure of use case and currently not supported
     *
     * @param list unused
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public AbstractSequence<NucleotideCompound> getSequence(
            List<NucleotideCompound> list) {
        throw new UnsupportedOperationException("Not supported yet.");
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@

import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.RNASequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -99,6 +101,35 @@ public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(
return GenbankProxyReader.process();

}

/**
 * Reads RNA sequences from a Genbank file, optionally deferring the loading of the sequence data.
 * <p>
 * With {@code lazySequenceLoad=false} this delegates to {@link #readGenbankRNASequence(File)}.
 * With {@code lazySequenceLoad=true} the file is parsed to determine the accession ids and offsets,
 * and the returned sequence objects are backed by a {@link FileProxyRNASequenceCreator} so that the
 * sequence can be read from disk later. This allows loading large Genbank files when only one
 * sequence, selected by accession id, is of interest.
 *
 * @param file the Genbank file to read
 * @param lazySequenceLoad {@code true} to create file-backed proxy sequences, {@code false} to read everything eagerly
 * @return the sequences produced by the reader's {@code process()} call
 * @throws Exception if the underlying reader fails
 */
public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(File file, boolean lazySequenceLoad) throws Exception {
    if (lazySequenceLoad) {
        // Collaborators are created in the same order as the original nested constructor arguments.
        GenericGenbankHeaderParser<RNASequence, NucleotideCompound> headerParser =
                new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>();
        FileProxyRNASequenceCreator proxyCreator = new FileProxyRNASequenceCreator(
                file,
                RNACompoundSet.getRNACompoundSet(),
                new GenbankSequenceParser<AbstractSequence<NucleotideCompound>, NucleotideCompound>());
        GenbankReader<RNASequence, NucleotideCompound> lazyReader =
                new GenbankReader<RNASequence, NucleotideCompound>(file, headerParser, proxyCreator);
        return lazyReader.process();
    }

    // Eager path: hand over to the plain File overload.
    return readGenbankRNASequence(file);
}

/**
* Read a Genbank file containing amino acids with setup that would handle most
* cases.
Expand Down Expand Up @@ -160,6 +191,34 @@ public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(
inStream.close();
return dnaSequences;
}
/**
 * Reads RNA sequences in Genbank format from an input stream.
 * <p>
 * The stream is parsed with a {@code GenericGenbankHeaderParser} and an {@code RNASequenceCreator}
 * configured with the RNA compound set. This method does not close {@code inStream} itself.
 *
 * @param inStream the stream supplying the Genbank data
 * @return the sequences produced by the reader's {@code process()} call
 * @throws Exception if the underlying reader fails
 */
public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(
        InputStream inStream) throws Exception {
    GenericGenbankHeaderParser<RNASequence, NucleotideCompound> headerParser =
            new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>();
    RNASequenceCreator rnaCreator = new RNASequenceCreator(RNACompoundSet.getRNACompoundSet());
    return new GenbankReader<RNASequence, NucleotideCompound>(inStream, headerParser, rnaCreator).process();
}

/**
 * Reads RNA sequences from a Genbank file.
 * <p>
 * Opens the file, delegates to {@link #readGenbankRNASequence(InputStream)} and closes the file
 * stream again, whether or not parsing succeeds.
 *
 * @param file the Genbank file to read
 * @return the sequences returned by {@link #readGenbankRNASequence(InputStream)}
 * @throws Exception if the file cannot be opened, or if parsing or closing the stream fails
 */
public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(
        File file) throws Exception {
    FileInputStream inStream = new FileInputStream(file);
    try {
        return readGenbankRNASequence(inStream);
    } finally {
        // Close in finally so the file handle is released even when parsing throws.
        inStream.close();
    }
}

public static void main(String[] args) throws Exception {

Expand Down