Skip to content

Commit df22da3

Browse files
author
Dmytro Guzenko
committed
Added comments/docs following the review.
1 parent 996c149 commit df22da3

File tree

6 files changed

+85
-29
lines changed

6 files changed

+85
-29
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/alignment/SimpleSequencePair.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,15 @@ public AlignedSequence<S, C> getTarget() {
207207
return getAlignedSequence(2);
208208
}
209209

210+
/**
211+
* Returns the percentage of identity between the two sequences in the alignment as a fraction between 0 and 1.
212+
*
213+
* @param countGaps
214+
* If true, gap positions are counted as mismatches, i.e., the percentage is normalized by the alignment length.
215+
* If false, gap positions are not counted, i.e., the percentage is normalized by the number of aligned residue pairs.
216+
* See May (2004). "Percent sequence identity: the need to be explicit."
217+
* @return the percentage of sequence identity as a fraction in [0,1]
218+
*/
210219
@Override
211220
public double getPercentageOfIdentity(boolean countGaps) {
212221
double seqid = getNumIdenticals();

biojava-core/src/main/java/org/biojava/nbio/core/alignment/template/SequencePair.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,11 @@ public interface SequencePair<S extends Sequence<C>, C extends Compound> extends
9999

100100
/**
101101
* Returns the percentage of identity between the two sequences in the alignment as a fraction between 0 and 1.
102-
* This is equivalent to {@link #getNumIdenticals()} / {@link #getLength()}.
103-
* @param countGaps
104-
* If true, gap positions are included into the calculation.
105102
*
103+
* @param countGaps
104+
* If true, gap positions are counted as mismatches, i.e., the percentage is normalized by the alignment length.
105+
* If false, gap positions are not counted, i.e., the percentage is normalized by the number of aligned residue pairs.
106+
* See May (2004). "Percent sequence identity: the need to be explicit."
106107
* @return the percentage of sequence identity as a fraction in [0,1]
107108
*/
108109
double getPercentageOfIdentity(boolean countGaps);

biojava-structure/src/main/java/org/biojava/nbio/structure/Calc.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1282,8 +1282,7 @@ public static double rmsd(Atom[] x, Atom[] y) {
12821282
* @return The TM-Score
12831283
* @throws StructureException
12841284
*/
1285-
public static double getTMScore(Atom[] atomSet1, Atom[] atomSet2,
1286-
int len1, int len2) throws StructureException {
1285+
public static double getTMScore(Atom[] atomSet1, Atom[] atomSet2, int len1, int len2) throws StructureException {
12871286
return getTMScore(atomSet1, atomSet2, len1, len2,true);
12881287
}
12891288

biojava-structure/src/main/java/org/biojava/nbio/structure/align/ce/CeParameters.java

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,11 +71,11 @@ public String toString() {
7171
protected boolean showAFPRanges;
7272
protected int sideChainScoringType;
7373

74-
// whether to perform final optimisation of the alignment (slow),
75-
// which "attempts to increase the alignment length
76-
// found previously while keeping the r.m.s.d. at
77-
// about the same level" (Shindyalov and Bourne, 1998)
78-
// may not be needed for some applications
74+
/**
75+
* Whether the CE algorithm should extend the best found trace with dynamic programming,
76+
* while keeping RMSD at about the same level (Shindyalov and Bourne, 1998).
77+
* This is useful for edge cases with remote homology, but can be slow for large structures.
78+
*/
7979
private boolean optimizeAlignment;
8080

8181
protected static final double DEFAULT_GAP_OPEN = 5.0;
@@ -425,10 +425,24 @@ public void setSubstitutionMatrix(
425425
}
426426

427427

428+
/**
429+
* Whether the CE algorithm should extend the best found trace with dynamic programming,
430+
* while keeping RMSD at about the same level. This is useful for edge cases with remote homology,
431+
* but can be slow for large structures.
432+
*
433+
* @return optimizeAlignment
434+
*/
428435
public boolean isOptimizeAlignment() {
429436
return optimizeAlignment;
430437
}
431438

439+
/**
440+
* Whether the CE algorithm should extend the best found trace with dynamic programming,
441+
* while keeping RMSD at about the same level. This is useful for edge cases with remote homology,
442+
* but can be slow for large structures.
443+
*
444+
* @param optimizeAlignment
445+
*/
432446
public void setOptimizeAlignment(boolean optimizeAlignment) {
433447
this.optimizeAlignment = optimizeAlignment;
434448
}

biojava-structure/src/main/java/org/biojava/nbio/structure/align/fatcat/calc/FatCatParameters.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,10 +456,24 @@ private Object getValue(String name){
456456

457457
}
458458

459+
/**
460+
* Whether the alignment algorithm should try its best to optimize the alignment,
461+
* or whether a quick and dirty result is acceptable.
462+
* NB: Not implemented in jFatCat
463+
*
464+
* @return optimizeAlignment
465+
*/
459466
public boolean isOptimizeAlignment() {
460467
return optimizeAlignment;
461468
}
462469

470+
/**
471+
* Whether the alignment algorithm should try its best to optimize the alignment,
472+
* or whether a quick and dirty result is acceptable.
473+
* NB: Not implemented in jFatCat
474+
*
475+
* @param optimizeAlignment
476+
*/
463477
public void setOptimizeAlignment(boolean optimizeAlignment) {
464478
this.optimizeAlignment = optimizeAlignment;
465479
}

biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClustererParameters.java

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,6 @@ public class SubunitClustererParameters implements Serializable {
4141
private int absoluteMinimumSequenceLength = 5;
4242
private double minimumSequenceLengthFraction = 0.75;
4343

44-
// "local" metrics are separately scoring
45-
// SubunitClustererMethod.SEQUENCE: sequence identity of a local alignment (normalised by the number of
46-
// aligned residues) and sequence coverage of the alignment
47-
// SubunitClustererMethod.STRUCTURE: RMSD of the aligned substructures and structure coverage of the alignment
48-
// require two respective thresholds for each method
49-
50-
// "global" metrics are scoring
51-
// SubunitClustererMethod.SEQUENCE: sequence identity of a global alignment (normalised by the number of columns)
52-
// SubunitClustererMethod.STRUCTURE: TMScore of the aligned structures (normalised by the larger structure)
53-
// require one threshold for each method
5444
private boolean useGlobalMetrics;
5545
private double sequenceIdentityThreshold;
5646
private double sequenceCoverageThreshold = 0.75;
@@ -71,17 +61,34 @@ public class SubunitClustererParameters implements Serializable {
7161

7262
private boolean internalSymmetry = false;
7363

74-
// subunits aligned with these or better scores will be considered "identical"
64+
/**
65+
* Subunits aligned with these or better scores will be considered "identical".
66+
*/
7567
private static final double hcSequenceIdentityLocal = 0.95;
7668
private static final double hcSequenceCoverageLocal = 0.75;
7769
private static final double hcSequenceIdentityGlobal = 0.85;
7870

79-
public SubunitClustererParameters() {
80-
this(false); // initialize with local metrics by default
81-
}
82-
83-
public SubunitClustererParameters(boolean iUseGlobalMetrics) {
84-
useGlobalMetrics = iUseGlobalMetrics;
71+
/**
72+
* "Local" metrics are scoring
73+
* SubunitClustererMethod.SEQUENCE: sequence identity of a local alignment
74+
* (normalised by the number of aligned residues)
75+
* sequence coverage of the alignment
76+
* (normalised by the length of the longer sequence)
77+
* SubunitClustererMethod.STRUCTURE: RMSD of the aligned substructures
78+
* and structure coverage of the alignment
79+
* (normalised by the length of the larger structure)
80+
* Two thresholds for each method are required.
81+
*
82+
* "Global" metrics are scoring
83+
* SubunitClustererMethod.SEQUENCE: sequence identity of a global alignment
84+
* (normalised by the length of the alignment)
85+
* SubunitClustererMethod.STRUCTURE: TMScore of the aligned structures
86+
* (normalised by the length of the larger structure)
87+
* One threshold for each method is required.
88+
*
89+
*/
90+
public SubunitClustererParameters(boolean useGlobalMetrics) {
91+
this.useGlobalMetrics = useGlobalMetrics;
8592

8693
if (useGlobalMetrics) {
8794
sequenceIdentityThreshold = hcSequenceIdentityGlobal;
@@ -97,6 +104,14 @@ public SubunitClustererParameters(boolean iUseGlobalMetrics) {
97104
useTMScore = false;
98105
}
99106
}
107+
108+
/**
109+
* Initialize with "local" metrics by default.
110+
*/
111+
public SubunitClustererParameters() {
112+
this(false);
113+
}
114+
100115
/**
101116
* Get the minimum number of residues of a subunit to be considered in the
102117
* clusters.
@@ -473,12 +488,16 @@ public void setUseGlobalMetrics(boolean useGlobalMetrics) {
473488
}
474489

475490
/**
476-
* Whether the subunits can be considered "identical" by sequence alignment
491+
* Whether the subunits can be considered "identical" by sequence alignment.
492+
* For local sequence alignment (normalised by the number of aligned pairs)
493+
* this means 0.95 or higher identity and 0.75 or higher coverage.
494+
* For global sequence alignment (normalised by the alignment length)
495+
* this means 0.85 or higher sequence identity.
477496
*
478497
* @param sequenceIdentity
479498
* @param sequenceCoverage
480499
* @return true if the sequence alignment scores are equal to
481-
* or better than the predefined "high confidence" scores, false otherwise.
500+
* or better than the "high confidence" scores, false otherwise.
482501
*/
483502
public boolean isHighConfidenceScores(double sequenceIdentity, double sequenceCoverage) {
484503
if (useGlobalMetrics)

0 commit comments

Comments
 (0)