-
Notifications
You must be signed in to change notification settings - Fork 395
Add an EMBL file parser to BioJava #621 #713
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
68dffbb
add classes of new feature parser for embl file
3dc43b4
t
fc23c95
t
9976d2c
testing embl parser
1eb16b2
Merge branch 'master' of https://github.com/biojava/biojava
d3e8e83
first version of embl parser
d49b3f9
pull changes request
NoorAldeenMB 79f0ed4
Embl Changes
NoorAldeenMB d24230a
Embl Changes
NoorAldeenMB 141a8da
Embl changes
NoorAldeenMB 9673da4
using resource loader in testing.
NoorAldeenMB 40ddf40
insert license header
NoorAldeenMB File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
113 changes: 113 additions & 0 deletions
113
biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/embl/EmblId.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,113 @@ | ||
| /* | ||
| * BioJava development code | ||
| * | ||
| * This code may be freely distributed and modified under the | ||
| * terms of the GNU Lesser General Public Licence. This should | ||
| * be distributed with the code. If you do not have a copy, | ||
| * see: | ||
| * | ||
| * http://www.gnu.org/copyleft/lesser.html | ||
| * | ||
| * Copyright for this code is held jointly by the individual | ||
| * authors. These should be listed in @author doc comments. | ||
| * | ||
| * For more information on the BioJava project and its aims, | ||
| * or to join the biojava-l mailing list, visit the home page | ||
| * at: | ||
| * | ||
| * http://www.biojava.org/ | ||
| * | ||
| */ | ||
| package org.biojava.nbio.core.sequence.io.embl; | ||
|
|
||
| import jdk.nashorn.internal.ir.annotations.Immutable; | ||
|
|
||
/**
 * Immutable holder for the fields parsed from the ID line of an EMBL entry:
 * <ul>
 * <li>primary accession number</li>
 * <li>sequence version number</li>
 * <li>topology: 'circular' or 'linear'</li>
 * <li>molecule type</li>
 * <li>data class</li>
 * <li>taxonomic division</li>
 * <li>sequence length</li>
 * </ul>
 * Every field is stored as the string handed to the constructor; no parsing
 * or validation is performed here.
 *
 * @author Noor Aldeen Al Mbaidin
 * @since 5.0.0
 */
// Immutability is guaranteed by the private final fields below. The class is
// deliberately NOT annotated with jdk.nashorn.internal.ir.annotations.Immutable:
// that is a JDK-internal Nashorn type which no longer exists since JDK 15.
public class EmblId {

    private final String primaryAccession;
    private final String sequenceVersion;
    private final String topology;
    private final String moleculeType;
    private final String dataClass;
    private final String taxonomicDivision;
    private final String sequenceLength;

    /**
     * Creates an ID record from the already separated ID-line fields.
     *
     * @param primaryAccession  primary accession number
     * @param sequenceVersion   sequence version number
     * @param topology          topology, 'circular' or 'linear'
     * @param moleculeType      molecule type
     * @param dataClass         data class
     * @param taxonomicDivision taxonomic division
     * @param sequenceLength    sequence length
     */
    public EmblId(String primaryAccession, String sequenceVersion, String topology,
                  String moleculeType, String dataClass, String taxonomicDivision,
                  String sequenceLength) {
        this.primaryAccession = primaryAccession;
        this.sequenceVersion = sequenceVersion;
        this.topology = topology;
        this.moleculeType = moleculeType;
        this.dataClass = dataClass;
        this.taxonomicDivision = taxonomicDivision;
        this.sequenceLength = sequenceLength;
    }

    /**
     * Returns the primary accession number.
     *
     * @return the primary accession number as given to the constructor
     */
    public String getPrimaryAccession() {
        return primaryAccession;
    }

    /**
     * Returns the sequence version.
     *
     * @return the sequence version as given to the constructor
     */
    public String getSequenceVersion() {
        return sequenceVersion;
    }

    /**
     * Returns the topology ('circular' or 'linear').
     *
     * @return the topology as given to the constructor
     */
    public String getTopology() {
        return topology;
    }

    /**
     * Returns the molecule type, i.e. the type of molecule as stored.
     *
     * @return the molecule type as given to the constructor
     */
    public String getMoleculeType() {
        return moleculeType;
    }

    /**
     * Returns the data class.
     *
     * @return the data class as given to the constructor
     */
    public String getDataClass() {
        return dataClass;
    }

    /**
     * Returns the taxonomic division.
     *
     * @return the taxonomic division as given to the constructor
     */
    public String getTaxonomicDivision() {
        return taxonomicDivision;
    }

    /**
     * Returns the sequence length. The last item on the ID line is the length
     * of the sequence (the total number of bases in the sequence). This number
     * includes base positions reported as present but undetermined (coded as "N").
     *
     * @return the sequence length as given to the constructor
     */
    public String getSequenceLength() {
        return sequenceLength;
    }

}
163 changes: 163 additions & 0 deletions
163
biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/embl/EmblReader.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,163 @@ | ||
| /* | ||
| * BioJava development code | ||
| * | ||
| * This code may be freely distributed and modified under the | ||
| * terms of the GNU Lesser General Public Licence. This should | ||
| * be distributed with the code. If you do not have a copy, | ||
| * see: | ||
| * | ||
| * http://www.gnu.org/copyleft/lesser.html | ||
| * | ||
| * Copyright for this code is held jointly by the individual | ||
| * authors. These should be listed in @author doc comments. | ||
| * | ||
| * For more information on the BioJava project and its aims, | ||
| * or to join the biojava-l mailing list, visit the home page | ||
| * at: | ||
| * | ||
| * http://www.biojava.org/ | ||
| * | ||
| */ | ||
| package org.biojava.nbio.core.sequence.io.embl; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. All source code in biojava requires the license header.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. done. |
||
|
|
||
|
|
||
| import java.io.*; | ||
| import java.util.Arrays; | ||
| import java.util.LinkedList; | ||
|
|
||
|
|
||
| /** | ||
| * This class should process the data of embl file | ||
| * | ||
| * @author Noor Aldeen Al Mbaidin | ||
| * @since 5.0.0 | ||
| */ | ||
| public class EmblReader { | ||
|
|
||
| /** | ||
| * The parsing is done in this method.<br> | ||
| * This method tries to process all the Embl records | ||
| * in the File , closes the underlying resource, | ||
| * and return the results in object of EmblRecord.<br> | ||
| * | ||
| * @return EmblRecord containing all the parsed Embl records | ||
| * @throws IOException | ||
| */ | ||
| public static EmblRecord process(File file) throws IOException { | ||
|
|
||
| EmblRecord emblRecord = new EmblRecord(); | ||
| StringBuilder sequence = new StringBuilder(""); | ||
| LinkedList<EmblReference> emblReferences = new LinkedList<>(); | ||
| EmblReference emblReference = new EmblReference(); | ||
| LinkedList<String> accessionNumber = new LinkedList<>(); | ||
| LinkedList<String> keyword = new LinkedList<>(); | ||
|
|
||
| if (file == null) | ||
| throw new NullPointerException("file can't be null"); | ||
|
|
||
| if (file.isDirectory()) | ||
| throw new IllegalArgumentException("the file can't be a directory"); | ||
|
|
||
| try (FileReader fileReader = new FileReader(file)) { | ||
| String line = ""; | ||
| String lineIdentifier; | ||
| String lineInfo; | ||
| try (BufferedReader bufferedReader = new BufferedReader(fileReader)) { | ||
| while ((line = bufferedReader.readLine()) != null) { | ||
| if (line.length() > 1) { | ||
| lineInfo = line.substring(2, line.length()).trim(); | ||
| lineIdentifier = line.substring(0, 2); | ||
| if (lineIdentifier.equals("ID")) | ||
| emblRecord.setEmblId(populateID(lineInfo)); | ||
| else if (lineIdentifier.equals("AC")) | ||
| populateAccessionNumber(line, accessionNumber); | ||
| else if (lineIdentifier.equals("DT") && line.contains("Created")) | ||
| emblRecord.setCreatedDate(lineInfo); | ||
| else if (lineIdentifier.equals("DT") && line.contains("updated")) | ||
| emblRecord.setLastUpdatedDate(lineInfo); | ||
| else if (lineIdentifier.equals("DE")) | ||
| emblRecord.setSequenceDescription(lineInfo); | ||
| else if (lineIdentifier.equals("KW")) | ||
| keyword.add(lineInfo); | ||
| else if (lineIdentifier.equals("OS")) | ||
| emblRecord.setOrganismSpecies(lineInfo); | ||
| else if (lineIdentifier.equals("OC")) | ||
| emblRecord.setOrganismClassification(lineInfo); | ||
| else if (lineIdentifier.equals("OG")) | ||
| emblRecord.setOrGanelle(lineInfo); | ||
| else if (lineIdentifier.equals("RN") || lineIdentifier.equals("RP") | ||
| || lineIdentifier.equals("RX") || lineIdentifier.equals("RG") | ||
| || lineIdentifier.equals("RA") || lineIdentifier.equals("RT") | ||
| || lineIdentifier.equals("RL")) | ||
| populateEmblReferences(lineIdentifier, lineInfo, emblReference, emblReferences); | ||
| else if (lineIdentifier.equals("DR")) | ||
| emblRecord.setDatabaseCrossReference(lineInfo); | ||
| else if (lineIdentifier.equals("AH")) | ||
| emblRecord.setAssemblyHeader(lineInfo); | ||
| else if (lineIdentifier.equals("AS")) | ||
| emblRecord.setAssemblyInformation(lineInfo); | ||
| else if (lineIdentifier.equals("CO")) | ||
| emblRecord.setConstructedSequence(lineInfo); | ||
| else if (lineIdentifier.equals("FH")) | ||
| emblRecord.setFeatureHeader(lineInfo); | ||
| else if (lineIdentifier.equals("FT")) | ||
| emblRecord.setFeatureTable(lineInfo); | ||
| else if (lineIdentifier.equals("SQ")) | ||
| emblRecord.setSequenceHeader(lineInfo); | ||
| else if (lineIdentifier.equals(" ") && !lineIdentifier.equals("//")) | ||
| populateSequence(line, sequence); | ||
| else if (lineIdentifier.equals("//")) { | ||
| emblRecord.setKeyword(keyword); | ||
| emblRecord.setEmblReference(emblReferences); | ||
| emblRecord.setAccessionNumber(accessionNumber); | ||
| emblRecord.setSequence(sequence.toString()); | ||
| } | ||
|
|
||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return emblRecord; | ||
| } | ||
|
|
||
| private static void populateSequence(String line, StringBuilder sequence) { | ||
| String sequenceLine = line.replace(" ", ""). | ||
| replaceAll("[0-9]", ""); | ||
| sequence.append(sequenceLine); | ||
| } | ||
|
|
||
| private static void populateEmblReferences(String lineIdentifier, String lineInfo, EmblReference emblReference | ||
| , LinkedList<EmblReference> emblReferences) { | ||
| if (lineIdentifier.equals("RN")) | ||
| emblReference.setReferenceNumber(lineInfo); | ||
| else if (lineIdentifier.equals("RP")) | ||
| emblReference.setReferencePosition(lineInfo); | ||
| else if (lineIdentifier.equals("RX")) | ||
| emblReference.setReferenceCrossReference(lineInfo); | ||
| else if (lineIdentifier.equals("RG")) | ||
| emblReference.setReferenceGroup(lineInfo); | ||
| else if (lineIdentifier.equals("RA")) | ||
| emblReference.setReferenceAuthor(lineInfo); | ||
| else if (lineIdentifier.equals("RT")) | ||
| emblReference.setReferenceTitle(lineInfo); | ||
| else if (lineIdentifier.equals("RL")) { | ||
| emblReference.setReferenceLocation(lineInfo); | ||
| emblReferences.add(emblReference.copyEmblReference(emblReference)); | ||
| } | ||
| } | ||
|
|
||
| private static void populateAccessionNumber(String line, LinkedList<String> accessionNumber) { | ||
| accessionNumber.add(line); | ||
| } | ||
|
|
||
| private static EmblId populateID(String line) { | ||
| String[] strings = line.split(";"); | ||
| Arrays.stream(strings).map(String::trim).toArray(unused -> strings); | ||
| EmblId emblId = new EmblId(strings[0], strings[1], strings[2] | ||
| , strings[3], strings[4], strings[5], strings[6]); | ||
| return emblId; | ||
| } | ||
|
|
||
|
|
||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All source code in biojava requires the license header
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.