biojava · josemduarte · Jan 12, 2020 · Dec 12, 2019 · Dec 27, 2019 · Jan 5, 2020
diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitCluster.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitCluster.java
@@ -31,6 +31,8 @@
 import org.biojava.nbio.core.sequence.ProteinSequence;
 import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
 import org.biojava.nbio.structure.Atom;
+import org.biojava.nbio.structure.Chain;
+import org.biojava.nbio.structure.Structure;
 import org.biojava.nbio.structure.StructureException;
 import org.biojava.nbio.structure.align.StructureAlignment;
 import org.biojava.nbio.structure.align.StructureAlignmentFactory;
@@ -45,6 +47,7 @@
 import org.biojava.nbio.structure.align.multiple.MultipleAlignmentImpl;
 import org.biojava.nbio.structure.align.multiple.util.MultipleAlignmentScorer;
 import org.biojava.nbio.structure.align.multiple.util.ReferenceSuperimposer;
+import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
 import org.biojava.nbio.structure.symmetry.core.QuatSymmetrySubunits;
 import org.biojava.nbio.structure.symmetry.internal.CESymmParameters;
 import org.biojava.nbio.structure.symmetry.internal.CeSymm;
@@ -71,11 +74,10 @@
  */
 public class SubunitCluster {
 
-	private static final Logger logger = LoggerFactory
-			.getLogger(SubunitCluster.class);
+	private static final Logger logger = LoggerFactory.getLogger(SubunitCluster.class);
 
-	private List<Subunit> subunits = new ArrayList<Subunit>();
-	private List<List<Integer>> subunitEQR = new ArrayList<List<Integer>>();
+	private List<Subunit> subunits = new ArrayList<>();
+	private List<List<Integer>> subunitEQR = new ArrayList<>();
 	private int representative = -1;
 
 	private SubunitClustererMethod method = SubunitClustererMethod.SEQUENCE;
@@ -119,7 +121,7 @@ public SubunitCluster(Subunit subunit) {
 
 		subunits.add(subunit);
 
-		List<Integer> identity = new ArrayList<Integer>();
+		List<Integer> identity = new ArrayList<>();
 		for (int i = 0; i < subunit.size(); i++)
 			identity.add(i);
 		subunitEQR.add(identity);
@@ -179,6 +181,38 @@ public boolean isIdenticalTo(SubunitCluster other) {
 		return thisSequence.equals(otherSequence);
 	}
 
+	/**
+	 * Tells whether the other SubunitCluster contains exactly the same Subunit.
+	 * This is checked by equality of their entity identifiers if they are present.
+	 *
+	 * @param other
+	 *            SubunitCluster
+	 * @return true if the SubunitClusters are identical, false otherwise
+	 */
+	public boolean isIdenticalByEntityIdTo(SubunitCluster other) {
+		Structure thisStruct = this.subunits.get(this.representative).getStructure();
+		Structure otherStruct = other.subunits.get(other.representative).getStructure();
+		String thisName = this.subunits.get(this.representative).getName();
+		String otherName = other.subunits.get(this.representative).getName();
+		Chain thisChain = thisStruct.getChain(thisName);
+		Chain otherChain = otherStruct.getChain(otherName);
+		if (thisChain == null || otherChain == null) {
+			logger.info("Can't determine entity ids of SubunitClusters {}-{}. Ignoring identity check by entity id",
+					this.subunits.get(this.representative).getName(),
+					other.subunits.get(other.representative).getName());
+			return false;
+		}
+		if (thisChain.getEntityInfo() == null || otherChain.getEntityInfo() == null) {
+			logger.info("Can't determine entity ids of SubunitClusters {}-{}. Ignoring identity check by entity id",
+					this.subunits.get(this.representative).getName(),
+					other.subunits.get(other.representative).getName());
+			return false;
+		}
+		int thisEntityId = thisChain.getEntityInfo().getMolId();
+		int otherEntityId = otherChain.getEntityInfo().getMolId();
+		return thisEntityId == otherEntityId;
+	}
+
 	/**
 	 * Merges the other SubunitCluster into this one if it contains exactly the
 	 * same Subunit. This is checked by {@link #isIdenticalTo(SubunitCluster)}.
@@ -192,7 +226,35 @@ public boolean mergeIdentical(SubunitCluster other) {
 		if (!isIdenticalTo(other))
 			return false;
 
-		logger.info("SubunitClusters are identical");
+		logger.info("SubunitClusters {}-{} are identical in sequence",
+				this.subunits.get(this.representative).getName(),
+				other.subunits.get(other.representative).getName());
+
+		this.subunits.addAll(other.subunits);
+		this.subunitEQR.addAll(other.subunitEQR);
+
+		return true;
+	}
+
+	/**
+	 * Merges the other SubunitCluster into this one if it contains exactly the
+	 * same Subunit. This is checked by comparing the entity identifiers of the subunits
+	 * if one can be found.
+	 * Thus this only makes sense when the subunits are complete chains of a
+	 * deposited PDB entry. I
+	 *
+	 * @param other
+	 *            SubunitCluster
+	 * @return true if the SubunitClusters were merged, false otherwise
+	 */
+	public boolean mergeIdenticalByEntityId(SubunitCluster other) {
+
+		if (!isIdenticalByEntityIdTo(other))
+			return false;
+
+		logger.info("SubunitClusters {}-{} belong to same entity. Assuming they are identical",
+				this.subunits.get(this.representative).getName(),
+				other.subunits.get(other.representative).getName());
 
 		this.subunits.addAll(other.subunits);
 		this.subunitEQR.addAll(other.subunitEQR);
@@ -296,13 +358,15 @@ public boolean mergeSequence(SubunitCluster other, SubunitClustererParameters pa
 				return false;
 		}
 
-		logger.info(String.format("SubunitClusters are similar in sequence "
-						+ "with %.2f sequence identity and %.2f coverage", sequenceIdentity,
-				sequenceCoverage));
+		logger.info(String.format("SubunitClusters %s-%s are similar in sequence "
+						+ "with %.2f sequence identity and %.2f coverage",
+				this.subunits.get(this.representative).getName(),
+				other.subunits.get(other.representative).getName(),
+				sequenceIdentity, sequenceCoverage));
 
 		// If coverage and sequence identity sufficient, merge other and this
-		List<Integer> thisAligned = new ArrayList<Integer>();
-		List<Integer> otherAligned = new ArrayList<Integer>();
+		List<Integer> thisAligned = new ArrayList<>();
+		List<Integer> otherAligned = new ArrayList<>();
 
 		// Extract the aligned residues of both Subunit
 		for (int p = 1; p < aligner.getPair().getLength() + 1; p++) {
@@ -318,60 +382,15 @@ public boolean mergeSequence(SubunitCluster other, SubunitClustererParameters pa
 
 			// Only consider residues that are part of the SubunitCluster
 			if (this.subunitEQR.get(this.representative).contains(thisIndex)
-					&& other.subunitEQR.get(other.representative).contains(
-							otherIndex)) {
+					&& other.subunitEQR.get(other.representative).contains(otherIndex)) {
 				thisAligned.add(thisIndex);
 				otherAligned.add(otherIndex);
 			}
 		}
 
-		// Do a List intersection to find out which EQR columns to remove
-		List<Integer> thisRemove = new ArrayList<Integer>();
-		List<Integer> otherRemove = new ArrayList<Integer>();
-
-		for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) {
-			// If the index is aligned do nothing, otherwise mark as removing
-			if (!thisAligned.contains(this.subunitEQR.get(this.representative)
-					.get(t)))
-				thisRemove.add(t);
-		}
-
-		for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) {
-			// If the index is aligned do nothing, otherwise mark as removing
-			if (!otherAligned.contains(other.subunitEQR.get(
-					other.representative).get(t)))
-				otherRemove.add(t);
-		}
-		// Now remove unaligned columns, from end to start
-		Collections.sort(thisRemove);
-		Collections.reverse(thisRemove);
-		Collections.sort(otherRemove);
-		Collections.reverse(otherRemove);
-
-		for (int t = 0; t < thisRemove.size(); t++) {
-			for (List<Integer> eqr : this.subunitEQR) {
-				int column = thisRemove.get(t);
-				eqr.remove(column);
-			}
-		}
-
-		for (int t = 0; t < otherRemove.size(); t++) {
-			for (List<Integer> eqr : other.subunitEQR) {
-				int column = otherRemove.get(t);
-				eqr.remove(column);
-			}
-		}
-
-		// The representative is the longest sequence
-		if (this.subunits.get(this.representative).size() < other.subunits.get(
-				other.representative).size())
-			this.representative = other.representative + subunits.size();
-
-		this.subunits.addAll(other.subunits);
-		this.subunitEQR.addAll(other.subunitEQR);
+		updateEquivResidues(other, thisAligned, otherAligned);
 
 		this.method = SubunitClustererMethod.SEQUENCE;
-
 		pseudoStoichiometric = !params.isHighConfidenceScores(sequenceIdentity,sequenceCoverage);
 
 		return true;
@@ -445,8 +464,8 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p
 
 		// Merge clusters
 		List<List<Integer>> alignedRes = msa.getBlock(0).getAlignRes();
-		List<Integer> thisAligned = new ArrayList<Integer>();
-		List<Integer> otherAligned = new ArrayList<Integer>();
+		List<Integer> thisAligned = new ArrayList<>();
+		List<Integer> otherAligned = new ArrayList<>();
 
 		// Extract the aligned residues of both Subunit
 		for (int p = 0; p < msa.length(); p++) {
@@ -469,24 +488,30 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p
 			}
 		}
 
+		updateEquivResidues(other, thisAligned, otherAligned);
+
+		this.method = SubunitClustererMethod.STRUCTURE;
+		pseudoStoichiometric = true;
+
+		return true;
+	}
+
+	private void updateEquivResidues(SubunitCluster other, List<Integer> thisAligned, List<Integer> otherAligned) {
 		// Do a List intersection to find out which EQR columns to remove
-		List<Integer> thisRemove = new ArrayList<Integer>();
-		List<Integer> otherRemove = new ArrayList<Integer>();
+		List<Integer> thisRemove = new ArrayList<>();
+		List<Integer> otherRemove = new ArrayList<>();
 
 		for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) {
 			// If the index is aligned do nothing, otherwise mark as removing
-			if (!thisAligned.contains(this.subunitEQR.get(this.representative)
-					.get(t)))
+			if (!thisAligned.contains(this.subunitEQR.get(this.representative).get(t)))
 				thisRemove.add(t);
 		}
 
 		for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) {
 			// If the index is aligned do nothing, otherwise mark as removing
-			if (!otherAligned.contains(other.subunitEQR.get(
-					other.representative).get(t)))
+			if (!otherAligned.contains(other.subunitEQR.get(other.representative).get(t)))
 				otherRemove.add(t);
 		}
-
 		// Now remove unaligned columns, from end to start
 		Collections.sort(thisRemove);
 		Collections.reverse(thisRemove);
@@ -508,17 +533,12 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p
 		}
 
 		// The representative is the longest sequence
-		if (this.subunits.get(this.representative).size() < other.subunits.get(
-				other.representative).size())
+		if (this.subunits.get(this.representative).size() < other.subunits.get(other.representative).size())
 			this.representative = other.representative + subunits.size();
 
 		this.subunits.addAll(other.subunits);
 		this.subunitEQR.addAll(other.subunitEQR);
 
-		this.method = SubunitClustererMethod.STRUCTURE;
-		pseudoStoichiometric = true;
-
-		return true;
 	}
 
 	/**
@@ -568,9 +588,9 @@ public boolean divideInternally(SubunitClustererParameters clusterParams)
 		List<List<Integer>> alignedRes = result.getMultipleAlignment()
 				.getBlock(0).getAlignRes();
 
-		List<List<Integer>> columns = new ArrayList<List<Integer>>();
+		List<List<Integer>> columns = new ArrayList<>();
 		for (int s = 0; s < alignedRes.size(); s++)
-			columns.add(new ArrayList<Integer>(alignedRes.get(s).size()));
+			columns.add(new ArrayList<>(alignedRes.get(s).size()));
 
 		// Extract the aligned columns of each repeat in the Subunit
 		for (int col = 0; col < alignedRes.get(0).size(); col++) {

diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClusterer.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClusterer.java
@@ -56,12 +56,8 @@ public static Stoichiometry cluster(Structure structure,
 		return cluster(subunits, params);
 	}
 
-	public static Stoichiometry cluster(List<Subunit> subunits,
-			SubunitClustererParameters params) {
-
-		// The collection of clusters to return
-		List<SubunitCluster> clusters = new ArrayList<SubunitCluster>();
-
+	public static Stoichiometry cluster(List<Subunit> subunits, SubunitClustererParameters params) {
+		List<SubunitCluster> clusters = new ArrayList<>();
 		if (subunits.size() == 0)
 			return new Stoichiometry(clusters);
 
@@ -75,7 +71,14 @@ public static Stoichiometry cluster(List<Subunit> subunits,
 			for (int c1 = 0; c1 < clusters.size(); c1++) {
 				for (int c2 = clusters.size() - 1; c2 > c1; c2--) {
 					try {
-						if (clusters.get(c1).mergeSequence(clusters.get(c2), params)) {
+						if (params.isUseEntityIdForSeqIdentityDetermination() &&
+								clusters.get(c1).mergeIdenticalByEntityId(clusters.get(c2))) {
+							// This we will only do if the switch is for entity id comparison is on.
+							// In some cases it can save enormous amounts of time, e.g. for clustering full
+							// chains of deposited PDB entries. For instance for 6NHJ: with pure alignments it
+							// takes ~ 6 hours, with entity id comparisons it takes 2 minutes.
+							clusters.remove(c2);
+						} else if (clusters.get(c1).mergeSequence(clusters.get(c2), params)) {
 							clusters.remove(c2);
 						}
 

diff --git a/...tructure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClustererParameters.java b/...tructure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClustererParameters.java
@@ -45,6 +45,8 @@ public class SubunitClustererParameters implements Serializable {
 	private double sequenceIdentityThreshold;
 	private double sequenceCoverageThreshold = 0.75;
 
+	private boolean useEntityIdForSeqIdentityDetermination = false;
+
 	private double rmsdThreshold = 3.0;
 	private double structureCoverageThreshold = 0.75;
 	private double tmThreshold = 0.5;
@@ -506,5 +508,21 @@ public boolean isHighConfidenceScores(double sequenceIdentity, double sequenceCo
 			return sequenceIdentity>=hcSequenceIdentityLocal && sequenceCoverage >= hcSequenceCoverageLocal;
 	}
 
+	/**
+	 * Whether to use the entity id of subunits to infer that sequences are identical.
+	 * Only applies if the {@link SubunitClustererMethod} is a sequence based one.
+	 * @return
+	 */
+	public boolean isUseEntityIdForSeqIdentityDetermination() {
+		return useEntityIdForSeqIdentityDetermination;
+	}
 
+	/**
+	 * Whether to use the entity id of subunits to infer that sequences are identical.
+	 * Only applies if the {@link SubunitClustererMethod} is a sequence based one.
+	 * @param useEntityIdForSeqIdentityDetermination the flag to be set
+	 */
+	public void setUseEntityIdForSeqIdentityDetermination(boolean useEntityIdForSeqIdentityDetermination) {
+		this.useEntityIdForSeqIdentityDetermination = useEntityIdForSeqIdentityDetermination;
+	}
 }
diff --git a/biojava-structure/src/test/java/org/biojava/nbio/structure/cluster/TestSubunitCluster.java b/biojava-structure/src/test/java/org/biojava/nbio/structure/cluster/TestSubunitCluster.java
@@ -52,8 +52,8 @@ public class TestSubunitCluster {
 	@Test
 	public void testMergeIdentical() {
 
-		// Create an Atom Array of ploy-alanine
-		List<Atom> atoms = new ArrayList<Atom>(10);
+		// Create an Atom Array of poly-alanine
+		List<Atom> atoms = new ArrayList<>(10);
 		for (int i = 0; i < 10; i++) {
 			Group g = new AminoAcidImpl();
 			g.setPDBName("ALA");
@@ -79,7 +79,7 @@ public void testMergeIdentical() {
 		assertEquals(sc1.length(), 10);
 
 		// Create an Atom Array of poly-glycine
-		List<Atom> atoms2 = new ArrayList<Atom>(10);
+		List<Atom> atoms2 = new ArrayList<>(10);
 		for (int i = 0; i < 10; i++) {
 			Group g = new AminoAcidImpl();
 			g.setPDBName("GLY");
@@ -112,7 +112,7 @@ public void testMergeIdentical() {
 	public void testMergeSequence() throws CompoundNotFoundException {
 
 		// Create an Atom Array of ploy-alanine
-		List<Atom> atoms = new ArrayList<Atom>(100);
+		List<Atom> atoms = new ArrayList<>(100);
 		for (int i = 0; i < 100; i++) {
 			Group g = new AminoAcidImpl();
 			g.setPDBName("ALA");
@@ -163,7 +163,7 @@ public void testMergeSequence() throws CompoundNotFoundException {
 		assertEquals(sc1.length(), 100);
 
 		// Create an Atom Array of 9 glycine and 91 alanine
-		List<Atom> atoms3 = new ArrayList<Atom>(100);
+		List<Atom> atoms3 = new ArrayList<>(100);
 		for (int i = 0; i < 9; i++) {
 			Group g = new AminoAcidImpl();
 			g.setPDBName("GLY");