Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.structure.Atom;
import org.biojava.nbio.structure.Chain;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.align.StructureAlignment;
import org.biojava.nbio.structure.align.StructureAlignmentFactory;
Expand All @@ -45,6 +47,7 @@
import org.biojava.nbio.structure.align.multiple.MultipleAlignmentImpl;
import org.biojava.nbio.structure.align.multiple.util.MultipleAlignmentScorer;
import org.biojava.nbio.structure.align.multiple.util.ReferenceSuperimposer;
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
import org.biojava.nbio.structure.symmetry.core.QuatSymmetrySubunits;
import org.biojava.nbio.structure.symmetry.internal.CESymmParameters;
import org.biojava.nbio.structure.symmetry.internal.CeSymm;
Expand All @@ -71,11 +74,10 @@
*/
public class SubunitCluster {

private static final Logger logger = LoggerFactory
.getLogger(SubunitCluster.class);
private static final Logger logger = LoggerFactory.getLogger(SubunitCluster.class);

private List<Subunit> subunits = new ArrayList<Subunit>();
private List<List<Integer>> subunitEQR = new ArrayList<List<Integer>>();
private List<Subunit> subunits = new ArrayList<>();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you remove the Class of elements in the List when it is defined? Has something changed in Java8+?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since Java 7 it has been redundant to repeat the type arguments in the variable initialization. The <> is called the "diamond" operator. See "The Diamond" section here: https://docs.oracle.com/javase/tutorial/java/generics/types.html

I think it is good style to remove them, since they are redundant. It helps readability.

private List<List<Integer>> subunitEQR = new ArrayList<>();
private int representative = -1;

private SubunitClustererMethod method = SubunitClustererMethod.SEQUENCE;
Expand Down Expand Up @@ -119,7 +121,7 @@ public SubunitCluster(Subunit subunit) {

subunits.add(subunit);

List<Integer> identity = new ArrayList<Integer>();
List<Integer> identity = new ArrayList<>();
for (int i = 0; i < subunit.size(); i++)
identity.add(i);
subunitEQR.add(identity);
Expand Down Expand Up @@ -179,6 +181,38 @@ public boolean isIdenticalTo(SubunitCluster other) {
return thisSequence.equals(otherSequence);
}

/**
 * Tells whether the other SubunitCluster contains exactly the same Subunit.
 * This is checked by equality of their entity identifiers if they are present.
 * <p>
 * For each cluster the representative Subunit is taken, its chain is looked
 * up in the Subunit's Structure by the Subunit's name, and the entity (mol)
 * ids of the two chains are compared. If a Structure, a chain or its entity
 * info cannot be obtained for either cluster, the check is skipped (an info
 * message is logged) and false is returned.
 *
 * @param other
 *            SubunitCluster
 * @return true if the representatives of both SubunitClusters have the same
 *         entity id, false if they differ or the ids can't be determined
 */
public boolean isIdenticalByEntityIdTo(SubunitCluster other) {
	String thisName = this.subunits.get(this.representative).getName();
	// Each cluster must be indexed with its OWN representative: using
	// this.representative on other's list picks the wrong subunit, or
	// throws IndexOutOfBoundsException, when the indices differ.
	String otherName = other.subunits.get(other.representative).getName();

	Structure thisStruct = this.subunits.get(this.representative).getStructure();
	Structure otherStruct = other.subunits.get(other.representative).getStructure();

	// NOTE(review): a Subunit may presumably carry no Structure; treat that
	// like a missing chain instead of failing with a NullPointerException.
	Chain thisChain = (thisStruct == null) ? null : thisStruct.getChain(thisName);
	Chain otherChain = (otherStruct == null) ? null : otherStruct.getChain(otherName);

	if (thisChain == null || otherChain == null
			|| thisChain.getEntityInfo() == null || otherChain.getEntityInfo() == null) {
		logger.info("Can't determine entity ids of SubunitClusters {}-{}. Ignoring identity check by entity id",
				thisName, otherName);
		return false;
	}

	int thisEntityId = thisChain.getEntityInfo().getMolId();
	int otherEntityId = otherChain.getEntityInfo().getMolId();
	return thisEntityId == otherEntityId;
}

/**
* Merges the other SubunitCluster into this one if it contains exactly the
* same Subunit. This is checked by {@link #isIdenticalTo(SubunitCluster)}.
Expand All @@ -192,7 +226,35 @@ public boolean mergeIdentical(SubunitCluster other) {
if (!isIdenticalTo(other))
return false;

logger.info("SubunitClusters are identical");
logger.info("SubunitClusters {}-{} are identical in sequence",
this.subunits.get(this.representative).getName(),
other.subunits.get(other.representative).getName());

this.subunits.addAll(other.subunits);
this.subunitEQR.addAll(other.subunitEQR);

return true;
}

/**
* Merges the other SubunitCluster into this one if it contains exactly the
* same Subunit. This is checked by comparing the entity identifiers of the subunits
* if one can be found.
* Thus this only makes sense when the subunits are complete chains of a
* deposited PDB entry.
*
* @param other
* SubunitCluster
* @return true if the SubunitClusters were merged, false otherwise
*/
public boolean mergeIdenticalByEntityId(SubunitCluster other) {

if (!isIdenticalByEntityIdTo(other))
return false;

logger.info("SubunitClusters {}-{} belong to same entity. Assuming they are identical",
this.subunits.get(this.representative).getName(),
other.subunits.get(other.representative).getName());

this.subunits.addAll(other.subunits);
this.subunitEQR.addAll(other.subunitEQR);
Expand Down Expand Up @@ -296,13 +358,15 @@ public boolean mergeSequence(SubunitCluster other, SubunitClustererParameters pa
return false;
}

logger.info(String.format("SubunitClusters are similar in sequence "
+ "with %.2f sequence identity and %.2f coverage", sequenceIdentity,
sequenceCoverage));
logger.info(String.format("SubunitClusters %s-%s are similar in sequence "
+ "with %.2f sequence identity and %.2f coverage",
this.subunits.get(this.representative).getName(),
other.subunits.get(other.representative).getName(),
sequenceIdentity, sequenceCoverage));

// If coverage and sequence identity sufficient, merge other and this
List<Integer> thisAligned = new ArrayList<Integer>();
List<Integer> otherAligned = new ArrayList<Integer>();
List<Integer> thisAligned = new ArrayList<>();
List<Integer> otherAligned = new ArrayList<>();

// Extract the aligned residues of both Subunit
for (int p = 1; p < aligner.getPair().getLength() + 1; p++) {
Expand All @@ -318,60 +382,15 @@ public boolean mergeSequence(SubunitCluster other, SubunitClustererParameters pa

// Only consider residues that are part of the SubunitCluster
if (this.subunitEQR.get(this.representative).contains(thisIndex)
&& other.subunitEQR.get(other.representative).contains(
otherIndex)) {
&& other.subunitEQR.get(other.representative).contains(otherIndex)) {
thisAligned.add(thisIndex);
otherAligned.add(otherIndex);
}
}

// Do a List intersection to find out which EQR columns to remove
List<Integer> thisRemove = new ArrayList<Integer>();
List<Integer> otherRemove = new ArrayList<Integer>();

for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) {
// If the index is aligned do nothing, otherwise mark as removing
if (!thisAligned.contains(this.subunitEQR.get(this.representative)
.get(t)))
thisRemove.add(t);
}

for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) {
// If the index is aligned do nothing, otherwise mark as removing
if (!otherAligned.contains(other.subunitEQR.get(
other.representative).get(t)))
otherRemove.add(t);
}
// Now remove unaligned columns, from end to start
Collections.sort(thisRemove);
Collections.reverse(thisRemove);
Collections.sort(otherRemove);
Collections.reverse(otherRemove);

for (int t = 0; t < thisRemove.size(); t++) {
for (List<Integer> eqr : this.subunitEQR) {
int column = thisRemove.get(t);
eqr.remove(column);
}
}

for (int t = 0; t < otherRemove.size(); t++) {
for (List<Integer> eqr : other.subunitEQR) {
int column = otherRemove.get(t);
eqr.remove(column);
}
}

// The representative is the longest sequence
if (this.subunits.get(this.representative).size() < other.subunits.get(
other.representative).size())
this.representative = other.representative + subunits.size();

this.subunits.addAll(other.subunits);
this.subunitEQR.addAll(other.subunitEQR);
updateEquivResidues(other, thisAligned, otherAligned);

this.method = SubunitClustererMethod.SEQUENCE;

pseudoStoichiometric = !params.isHighConfidenceScores(sequenceIdentity,sequenceCoverage);

return true;
Expand Down Expand Up @@ -445,8 +464,8 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p

// Merge clusters
List<List<Integer>> alignedRes = msa.getBlock(0).getAlignRes();
List<Integer> thisAligned = new ArrayList<Integer>();
List<Integer> otherAligned = new ArrayList<Integer>();
List<Integer> thisAligned = new ArrayList<>();
List<Integer> otherAligned = new ArrayList<>();

// Extract the aligned residues of both Subunit
for (int p = 0; p < msa.length(); p++) {
Expand All @@ -469,24 +488,30 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p
}
}

updateEquivResidues(other, thisAligned, otherAligned);

this.method = SubunitClustererMethod.STRUCTURE;
pseudoStoichiometric = true;

return true;
}

private void updateEquivResidues(SubunitCluster other, List<Integer> thisAligned, List<Integer> otherAligned) {
// Do a List intersection to find out which EQR columns to remove
List<Integer> thisRemove = new ArrayList<Integer>();
List<Integer> otherRemove = new ArrayList<Integer>();
List<Integer> thisRemove = new ArrayList<>();
List<Integer> otherRemove = new ArrayList<>();

for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) {
// If the index is aligned do nothing, otherwise mark as removing
if (!thisAligned.contains(this.subunitEQR.get(this.representative)
.get(t)))
if (!thisAligned.contains(this.subunitEQR.get(this.representative).get(t)))
thisRemove.add(t);
}

for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) {
// If the index is aligned do nothing, otherwise mark as removing
if (!otherAligned.contains(other.subunitEQR.get(
other.representative).get(t)))
if (!otherAligned.contains(other.subunitEQR.get(other.representative).get(t)))
otherRemove.add(t);
}

// Now remove unaligned columns, from end to start
Collections.sort(thisRemove);
Collections.reverse(thisRemove);
Expand All @@ -508,17 +533,12 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p
}

// The representative is the longest sequence
if (this.subunits.get(this.representative).size() < other.subunits.get(
other.representative).size())
if (this.subunits.get(this.representative).size() < other.subunits.get(other.representative).size())
this.representative = other.representative + subunits.size();

this.subunits.addAll(other.subunits);
this.subunitEQR.addAll(other.subunitEQR);

this.method = SubunitClustererMethod.STRUCTURE;
pseudoStoichiometric = true;

return true;
}

/**
Expand Down Expand Up @@ -568,9 +588,9 @@ public boolean divideInternally(SubunitClustererParameters clusterParams)
List<List<Integer>> alignedRes = result.getMultipleAlignment()
.getBlock(0).getAlignRes();

List<List<Integer>> columns = new ArrayList<List<Integer>>();
List<List<Integer>> columns = new ArrayList<>();
for (int s = 0; s < alignedRes.size(); s++)
columns.add(new ArrayList<Integer>(alignedRes.get(s).size()));
columns.add(new ArrayList<>(alignedRes.get(s).size()));

// Extract the aligned columns of each repeat in the Subunit
for (int col = 0; col < alignedRes.get(0).size(); col++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,8 @@ public static Stoichiometry cluster(Structure structure,
return cluster(subunits, params);
}

public static Stoichiometry cluster(List<Subunit> subunits,
SubunitClustererParameters params) {

// The collection of clusters to return
List<SubunitCluster> clusters = new ArrayList<SubunitCluster>();

public static Stoichiometry cluster(List<Subunit> subunits, SubunitClustererParameters params) {
List<SubunitCluster> clusters = new ArrayList<>();
if (subunits.size() == 0)
return new Stoichiometry(clusters);

Expand All @@ -75,7 +71,14 @@ public static Stoichiometry cluster(List<Subunit> subunits,
for (int c1 = 0; c1 < clusters.size(); c1++) {
for (int c2 = clusters.size() - 1; c2 > c1; c2--) {
try {
if (clusters.get(c1).mergeSequence(clusters.get(c2), params)) {
if (params.isUseEntityIdForSeqIdentityDetermination() &&
clusters.get(c1).mergeIdenticalByEntityId(clusters.get(c2))) {
// We only do this if the switch for entity id comparison is on.
// In some cases it can save enormous amounts of time, e.g. for clustering full
// chains of deposited PDB entries. For instance for 6NHJ: with pure alignments it
// takes ~ 6 hours, with entity id comparisons it takes 2 minutes.
clusters.remove(c2);
} else if (clusters.get(c1).mergeSequence(clusters.get(c2), params)) {
clusters.remove(c2);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ public class SubunitClustererParameters implements Serializable {
private double sequenceIdentityThreshold;
private double sequenceCoverageThreshold = 0.75;

private boolean useEntityIdForSeqIdentityDetermination = false;

private double rmsdThreshold = 3.0;
private double structureCoverageThreshold = 0.75;
private double tmThreshold = 0.5;
Expand Down Expand Up @@ -506,5 +508,21 @@ public boolean isHighConfidenceScores(double sequenceIdentity, double sequenceCo
return sequenceIdentity>=hcSequenceIdentityLocal && sequenceCoverage >= hcSequenceCoverageLocal;
}

/**
* Whether to use the entity id of subunits to infer that sequences are identical.
* Only applies if the {@link SubunitClustererMethod} is a sequence based one.
* @return true if entity ids are used to infer sequence identity, false
* otherwise
*/
public boolean isUseEntityIdForSeqIdentityDetermination() {
return useEntityIdForSeqIdentityDetermination;
}

/**
* Sets whether to use the entity id of subunits to infer that sequences are identical.
* Only applies if the {@link SubunitClustererMethod} is a sequence based one.
* @param useEntityIdForSeqIdentityDetermination the flag to be set: true to
* infer sequence identity from entity ids, false otherwise
*/
public void setUseEntityIdForSeqIdentityDetermination(boolean useEntityIdForSeqIdentityDetermination) {
this.useEntityIdForSeqIdentityDetermination = useEntityIdForSeqIdentityDetermination;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ public class TestSubunitCluster {
@Test
public void testMergeIdentical() {

// Create an Atom Array of ploy-alanine
List<Atom> atoms = new ArrayList<Atom>(10);
// Create an Atom Array of poly-alanine
List<Atom> atoms = new ArrayList<>(10);
for (int i = 0; i < 10; i++) {
Group g = new AminoAcidImpl();
g.setPDBName("ALA");
Expand All @@ -79,7 +79,7 @@ public void testMergeIdentical() {
assertEquals(sc1.length(), 10);

// Create an Atom Array of poly-glycine
List<Atom> atoms2 = new ArrayList<Atom>(10);
List<Atom> atoms2 = new ArrayList<>(10);
for (int i = 0; i < 10; i++) {
Group g = new AminoAcidImpl();
g.setPDBName("GLY");
Expand Down Expand Up @@ -112,7 +112,7 @@ public void testMergeIdentical() {
public void testMergeSequence() throws CompoundNotFoundException {

// Create an Atom Array of poly-alanine
List<Atom> atoms = new ArrayList<Atom>(100);
List<Atom> atoms = new ArrayList<>(100);
for (int i = 0; i < 100; i++) {
Group g = new AminoAcidImpl();
g.setPDBName("ALA");
Expand Down Expand Up @@ -163,7 +163,7 @@ public void testMergeSequence() throws CompoundNotFoundException {
assertEquals(sc1.length(), 100);

// Create an Atom Array of 9 glycine and 91 alanine
List<Atom> atoms3 = new ArrayList<Atom>(100);
List<Atom> atoms3 = new ArrayList<>(100);
for (int i = 0; i < 9; i++) {
Group g = new AminoAcidImpl();
g.setPDBName("GLY");
Expand Down