Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,13 @@
package org.biojava.nbio.structure.asa;

import org.biojava.nbio.structure.*;
import org.biojava.nbio.structure.contact.Contact;
import org.biojava.nbio.structure.contact.Grid;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.vecmath.Point3d;
import java.util.ArrayList;
import java.util.TreeMap;
import java.util.*;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor point, but maybe we should try to avoid wildcard imports?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's an IntelliJ feature. I do like individual imports better too... I'll have to check if I can configure IntelliJ to do it.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

Expand All @@ -36,28 +37,34 @@
/**
* Class to calculate Accessible Surface Areas based on
* the rolling ball algorithm by Shrake and Rupley.
*
* <p>
* The code is adapted from a python implementation at http://boscoh.com/protein/asapy
* (now source is available at https://github.com/boscoh/asa).
* Thanks to Bosco K. Ho for a great piece of code and for his fantastic blog.
*
* <p>
* See
* Shrake, A., and J. A. Rupley. "Environment and Exposure to Solvent of Protein Atoms.
* Lysozyme and Insulin." JMB (1973) 79:351-371.
* Lee, B., and Richards, F.M. "The interpretation of Protein Structures: Estimation of
* Static Accessibility" JMB (1971) 55:379-400
* @author duarte_j
*
* @author Jose Duarte
*
*/
public class AsaCalculator {

private static final Logger logger = LoggerFactory.getLogger(AsaCalculator.class);

// Bosco uses as default 960, Shrake and Rupley seem to use in their paper 92 (not sure if this is actually the same parameter)
public static final int DEFAULT_N_SPHERE_POINTS = 960;
/**
* The default value for number of sphere points to sample.
* See this paper for a nice study on the effect of this parameter: https://f1000research.com/articles/5-189/v1
*/
public static final int DEFAULT_N_SPHERE_POINTS = 1000;
public static final double DEFAULT_PROBE_SIZE = 1.4;
public static final int DEFAULT_NTHREADS = 1;

private static final boolean DEFAULT_USE_SPATIAL_HASHING = true;



// Chothia's amino acid atoms vdw radii
Expand Down Expand Up @@ -101,15 +108,19 @@ public void run() {
private int nThreads;
private Point3d[] spherePoints;
private double cons;
private int[][] neighborIndices;

private boolean useSpatialHashingForNeighbors;

/**
* Constructs a new AsaCalculator. Subsequently call {@link #calculateAsas()}
* or {@link #getGroupAsas()} to calculate the ASAs
* Only non-Hydrogen atoms are considered in the calculation.
* @param structure
* @param probe
* @param nSpherePoints
* @param nThreads
* @param structure the structure, all non-H atoms will be used
* @param probe the probe size
* @param nSpherePoints the number of points to be used in generating the spherical
* dot-density, the more points the more accurate (and slower) calculation
* @param nThreads the number of parallel threads to use for the calculation
* @param hetAtoms if true HET residues are considered, if false they aren't, equivalent to
* NACCESS' -h option
* @see StructureTools#getAllNonHAtomArray
Expand All @@ -120,16 +131,15 @@ public AsaCalculator(Structure structure, double probe, int nSpherePoints, int n
this.probe = probe;
this.nThreads = nThreads;

this.useSpatialHashingForNeighbors = DEFAULT_USE_SPATIAL_HASHING;

// initialising the radii by looking them up through AtomRadii
radii = new double[atomCoords.length];
for (int i=0;i<atomCoords.length;i++) {
radii[i] = getRadius(atoms[i]);
}

// initialising the sphere points to sample
spherePoints = generateSpherePoints(nSpherePoints);

cons = 4.0 * Math.PI / nSpherePoints;
initSpherePoints(nSpherePoints);
}

/**
Expand All @@ -148,6 +158,8 @@ public AsaCalculator(Atom[] atoms, double probe, int nSpherePoints, int nThreads
this.probe = probe;
this.nThreads = nThreads;

this.useSpatialHashingForNeighbors = DEFAULT_USE_SPATIAL_HASHING;

for (Atom atom:atoms) {
if (atom.getElement()==Element.H)
throw new IllegalArgumentException("Can't calculate ASA for an array that contains Hydrogen atoms ");
Expand All @@ -159,10 +171,7 @@ public AsaCalculator(Atom[] atoms, double probe, int nSpherePoints, int nThreads
radii[i] = getRadius(atoms[i]);
}

// initialising the sphere points to sample
spherePoints = generateSpherePoints(nSpherePoints);

cons = 4.0 * Math.PI / nSpherePoints;
initSpherePoints(nSpherePoints);
}

/**
Expand All @@ -188,12 +197,21 @@ public AsaCalculator(Point3d[] atomCoords, double probe, int nSpherePoints, int
this.probe = probe;
this.nThreads = nThreads;

this.useSpatialHashingForNeighbors = DEFAULT_USE_SPATIAL_HASHING;

// initialising the radii to the given radius for all atoms
radii = new double[atomCoords.length];
for (int i=0;i<atomCoords.length;i++) {
radii[i] = radius;
}

initSpherePoints(nSpherePoints);
}

private void initSpherePoints(int nSpherePoints) {

logger.debug("Will use {} sphere points", nSpherePoints);

// initialising the sphere points to sample
spherePoints = generateSpherePoints(nSpherePoints);

Expand All @@ -209,7 +227,7 @@ public AsaCalculator(Point3d[] atomCoords, double probe, int nSpherePoints, int
*/
public GroupAsa[] getGroupAsas() {

TreeMap<ResidueNumber, GroupAsa> asas = new TreeMap<ResidueNumber, GroupAsa>();
TreeMap<ResidueNumber, GroupAsa> asas = new TreeMap<>();

double[] asasPerAtom = calculateAsas();

Expand Down Expand Up @@ -238,12 +256,26 @@ public double[] calculateAsas() {

double[] asas = new double[atomCoords.length];

long start = System.currentTimeMillis();
if (useSpatialHashingForNeighbors) {
logger.debug("Will use spatial hashing to find neighbors");
neighborIndices = findNeighborIndicesSpatialHashing();
} else {
logger.debug("Will not use spatial hashing to find neighbors");
neighborIndices = findNeighborIndices();
}
long end = System.currentTimeMillis();
logger.debug("Took {} s to find neighbors", (end-start)/1000.0);

start = System.currentTimeMillis();
if (nThreads<=1) { // (i.e. it will also be 1 thread if 0 or negative number specified)
logger.debug("Will use 1 thread for ASA calculation");
for (int i=0;i<atomCoords.length;i++) {
asas[i] = calcSingleAsa(i);
}

} else {
logger.debug("Will use {} threads for ASA calculation", nThreads);
// NOTE the multithreaded calculation does not scale up well in some systems,
// why? I guess some memory/garbage collect problem? I tried increasing Xmx in pc8201 but didn't help

Expand Down Expand Up @@ -293,10 +325,22 @@ public double[] calculateAsas() {
while (!threadPool.isTerminated());

}
end = System.currentTimeMillis();
logger.debug("Took {} s to calculate all {} atoms ASAs (excluding neighbors calculation)", (end-start)/1000.0, atomCoords.length);
Copy link

@ghost ghost Jan 4, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd probably keep invoke this function from a test and time and log there, eventually move it to a benchmark test suite.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My use case was testing other software that uses ASA calculation deep into some function call. I thought this would be a good way of debugging it. Do you see potential performance problems with this?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I don't see any real issue. I personally tend to move code from main to tests whenever possible, simply to reduce clutter in the real code.

currentTimeMillis probably cost nothing and the other computation in the logging steps seems minimal. I'm transitioning back to java, so I'm not an expert in java just yet.

Feel free to ignore my comments here. I'm mostly writing for myself to get my head into some of biojava's code.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kanishka-azimi Thanks for contributing! The more people reviewing PRs the better!


return asas;
}

/**
* Set the useSpatialHashingForNeighbors flag to use spatial hashing to calculate neighbors (true) or all-to-all
* distance calculation (false). Default is {@value DEFAULT_USE_SPATIAL_HASHING}.
* Use for testing performance only.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For small molecules is the old algorithm faster? If so maybe document that here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Performance is essentially the same for small molecules. The gain is very noticeable in very large molecules.

* @param useSpatialHashingForNeighbors the flag
*/
void setUseSpatialHashingForNeighbors(boolean useSpatialHashingForNeighbors) {
this.useSpatialHashingForNeighbors = useSpatialHashingForNeighbors;
}

/**
* Returns list of 3d coordinates of points on a sphere using the
* Golden Section Spiral algorithm.
Expand All @@ -317,36 +361,118 @@ private Point3d[] generateSpherePoints(int nSpherePoints) {
}

/**
* Returns list of indices of atoms within probe distance to atom k.
* @param k index of atom for which we want neighbor indices
* Returns the 2-dimensional array with neighbor indices for every atom.
* @return 2-dimensional array of size: n_atoms x n_neighbors_per_atom
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

adjacency list vs adjacency matrix...possible prefer one or the other universally based on expected density of graph

*/
private Integer[] findNeighborIndices(int k) {
int[][] findNeighborIndices() {

// looking at a typical protein case, number of neighbours are from ~10 to ~50, with an average of ~30
// Thus 40 seems to be a good compromise for the starting capacity
ArrayList<Integer> neighbor_indices = new ArrayList<>(40);
int initialCapacity = 60;

double radius = radii[k] + probe + probe;
int[][] nbsIndices = new int[atomCoords.length][];

for (int i=0;i<atomCoords.length;i++) {
if (i==k) continue;
for (int k=0; k<atomCoords.length; k++) {
double radius = radii[k] + probe + probe;

List<Integer> thisNbIndices = new ArrayList<>(initialCapacity);

for (int i = 0; i < atomCoords.length; i++) {
if (i == k) continue;

double dist = atomCoords[i].distance(atomCoords[k]);

if (dist < radius + radii[i]) {
thisNbIndices.add(i);
}
}

int[] indicesArray = new int[thisNbIndices.size()];
for (int i=0;i<thisNbIndices.size();i++) indicesArray[i] = thisNbIndices.get(i);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very minor note. Since random access isn't being used in the array list, could use Queue interface instead of List.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some cute shortcuts - https://stackoverflow.com/questions/960431/how-to-convert-listinteger-to-int-in-java . I don't know if biojava has any general preference regarding guava or apache commons.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! I like the answer that uses the stream API: https://stackoverflow.com/a/23945015/3914327

nbsIndices[k] = indicesArray;
}
return nbsIndices;
}

/**
* Returns the 2-dimensional array with neighbor indices for every atom,
* using spatial hashing to avoid all to all distance calculation.
* @return 2-dimensional array of size: n_atoms x n_neighbors_per_atom
*/
int[][] findNeighborIndicesSpatialHashing() {

double dist = atomCoords[i].distance(atomCoords[k]);
// looking at a typical protein case, number of neighbours are from ~10 to ~50, with an average of ~30
int initialCapacity = 60;

List<Contact> contactList = calcContacts();
Map<Integer, List<Integer>> indices = new HashMap<>(atomCoords.length);
for (Contact contact : contactList) {
// note contacts are stored 1-way only, with j>i
int i = contact.getI();
int j = contact.getJ();

List<Integer> iIndices;
List<Integer> jIndices;
if (!indices.containsKey(i)) {
iIndices = new ArrayList<>(initialCapacity);
indices.put(i, iIndices);
} else {
iIndices = indices.get(i);
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: putIfAbsent

if (!indices.containsKey(j)) {
jIndices = new ArrayList<>(initialCapacity);
indices.put(j, jIndices);
} else {
jIndices = indices.get(j);
}

if (dist < radius + radii[i]) {
neighbor_indices.add(i);
double radius = radii[i] + probe + probe;
double dist = contact.getDistance();
if (dist < radius + radii[j]) {
iIndices.add(j);
jIndices.add(i);
}
}

// convert map to array for fast access
int[][] nbsIndices = new int[atomCoords.length][];
for (Map.Entry<Integer, List<Integer>> entry : indices.entrySet()) {
List<Integer> list = entry.getValue();
int[] indicesArray = new int[list.size()];
for (int i=0;i<entry.getValue().size();i++) indicesArray[i] = list.get(i);
nbsIndices[entry.getKey()] = indicesArray;
}

Integer[] indicesArray = new Integer[neighbor_indices.size()];
indicesArray = neighbor_indices.toArray(indicesArray);
return indicesArray;
return nbsIndices;
}

Point3d[] getAtomCoords() {
return atomCoords;
}

private List<Contact> calcContacts() {
double maxRadius = maxValue(radii);
double cutoff = maxRadius + maxRadius + probe + probe;
logger.debug("Max radius is {}, cutoff is {}", maxRadius, cutoff);
Grid grid = new Grid(cutoff);
grid.addCoords(atomCoords);
return grid.getIndicesContacts();
}

private static double maxValue(double[] array) {
double max = array[0];
for (int i = 0; i < array.length; i++) {
if (array[i] > max) {
max = array[i];
}
}
return max;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: some stream shortcut could replace this function.

}

private double calcSingleAsa(int i) {
Point3d atom_i = atomCoords[i];
Integer[] neighbor_indices = findNeighborIndices(i);
int n_neighbor = neighbor_indices.length;

int n_neighbor = neighborIndices[i].length;
int[] neighbor_indices = neighborIndices[i];
int j_closest_neighbor = 0;
double radius = probe + radii[i];

Expand Down Expand Up @@ -497,7 +623,7 @@ private static double getRadiusForNucl(NucleotideImpl nuc, Atom atom) {
*
* If atom is neither part of a nucleotide nor of a standard aminoacid,
* the default vdw radius for the element is returned. If atom is of
* unknown type (element) the vdw radius of {@link #Element().N} is returned
* unknown type (element) the vdw radius of {@link Element().N} is returned
*
* @param atom
* @return
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,8 @@ protected void setAsas(double[] asas1, double[] asas2, int nSpherePoints, int nT
throw new IllegalArgumentException("The size of ASAs of complex doesn't match that of ASAs 1 + ASAs 2");


groupAsas1 = new TreeMap<ResidueNumber, GroupAsa>();
groupAsas2 = new TreeMap<ResidueNumber, GroupAsa>();
groupAsas1 = new TreeMap<>();
groupAsas2 = new TreeMap<>();

this.totalArea = 0;

Expand Down
Loading