-
Notifications
You must be signed in to change notification settings - Fork 397
Improved ASA calculation performance #820
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c213a7d
3b99afd
a84609b
1100299
ddeb759
ede5cee
041482d
f42a0fa
373a407
b674136
8fb10fd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,12 +21,13 @@ | |
| package org.biojava.nbio.structure.asa; | ||
|
|
||
| import org.biojava.nbio.structure.*; | ||
| import org.biojava.nbio.structure.contact.Contact; | ||
| import org.biojava.nbio.structure.contact.Grid; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| import javax.vecmath.Point3d; | ||
| import java.util.ArrayList; | ||
| import java.util.TreeMap; | ||
| import java.util.*; | ||
| import java.util.concurrent.ExecutorService; | ||
| import java.util.concurrent.Executors; | ||
|
|
||
|
|
@@ -36,28 +37,34 @@ | |
| /** | ||
| * Class to calculate Accessible Surface Areas based on | ||
| * the rolling ball algorithm by Shrake and Rupley. | ||
| * | ||
| * <p> | ||
| * The code is adapted from a python implementation at http://boscoh.com/protein/asapy | ||
| * (now source is available at https://github.com/boscoh/asa). | ||
| * Thanks to Bosco K. Ho for a great piece of code and for his fantastic blog. | ||
| * | ||
| * <p> | ||
| * See | ||
| * Shrake, A., and J. A. Rupley. "Environment and Exposure to Solvent of Protein Atoms. | ||
| * Lysozyme and Insulin." JMB (1973) 79:351-371. | ||
| * Lee, B., and Richards, F.M. "The interpretation of Protein Structures: Estimation of | ||
| * Static Accessibility" JMB (1971) 55:379-400 | ||
| * @author duarte_j | ||
| * | ||
| * @author Jose Duarte | ||
| * | ||
| */ | ||
| public class AsaCalculator { | ||
|
|
||
| private static final Logger logger = LoggerFactory.getLogger(AsaCalculator.class); | ||
|
|
||
| // Bosco uses as default 960, Shrake and Rupley seem to use in their paper 92 (not sure if this is actually the same parameter) | ||
| public static final int DEFAULT_N_SPHERE_POINTS = 960; | ||
| /** | ||
| * The default value for number of sphere points to sample. | ||
| * See this paper for a nice study on the effect of this parameter: https://f1000research.com/articles/5-189/v1 | ||
| */ | ||
| public static final int DEFAULT_N_SPHERE_POINTS = 1000; | ||
| public static final double DEFAULT_PROBE_SIZE = 1.4; | ||
| public static final int DEFAULT_NTHREADS = 1; | ||
|
|
||
| private static final boolean DEFAULT_USE_SPATIAL_HASHING = true; | ||
|
|
||
|
|
||
|
|
||
| // Chothia's amino acid atoms vdw radii | ||
|
|
@@ -101,15 +108,19 @@ public void run() { | |
| private int nThreads; | ||
| private Point3d[] spherePoints; | ||
| private double cons; | ||
| private int[][] neighborIndices; | ||
|
|
||
| private boolean useSpatialHashingForNeighbors; | ||
|
|
||
| /** | ||
| * Constructs a new AsaCalculator. Subsequently call {@link #calculateAsas()} | ||
| * or {@link #getGroupAsas()} to calculate the ASAs | ||
| * Only non-Hydrogen atoms are considered in the calculation. | ||
| * @param structure | ||
| * @param probe | ||
| * @param nSpherePoints | ||
| * @param nThreads | ||
| * @param structure the structure, all non-H atoms will be used | ||
| * @param probe the probe size | ||
| * @param nSpherePoints the number of points to be used in generating the spherical | ||
| * dot-density, the more points the more accurate (and slower) calculation | ||
| * @param nThreads the number of parallel threads to use for the calculation | ||
| * @param hetAtoms if true HET residues are considered, if false they aren't, equivalent to | ||
| * NACCESS' -h option | ||
| * @see StructureTools#getAllNonHAtomArray | ||
|
|
@@ -120,16 +131,15 @@ public AsaCalculator(Structure structure, double probe, int nSpherePoints, int n | |
| this.probe = probe; | ||
| this.nThreads = nThreads; | ||
|
|
||
| this.useSpatialHashingForNeighbors = DEFAULT_USE_SPATIAL_HASHING; | ||
|
|
||
| // initialising the radii by looking them up through AtomRadii | ||
| radii = new double[atomCoords.length]; | ||
| for (int i=0;i<atomCoords.length;i++) { | ||
| radii[i] = getRadius(atoms[i]); | ||
| } | ||
|
|
||
| // initialising the sphere points to sample | ||
| spherePoints = generateSpherePoints(nSpherePoints); | ||
|
|
||
| cons = 4.0 * Math.PI / nSpherePoints; | ||
| initSpherePoints(nSpherePoints); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -148,6 +158,8 @@ public AsaCalculator(Atom[] atoms, double probe, int nSpherePoints, int nThreads | |
| this.probe = probe; | ||
| this.nThreads = nThreads; | ||
|
|
||
| this.useSpatialHashingForNeighbors = DEFAULT_USE_SPATIAL_HASHING; | ||
|
|
||
| for (Atom atom:atoms) { | ||
| if (atom.getElement()==Element.H) | ||
| throw new IllegalArgumentException("Can't calculate ASA for an array that contains Hydrogen atoms "); | ||
|
|
@@ -159,10 +171,7 @@ public AsaCalculator(Atom[] atoms, double probe, int nSpherePoints, int nThreads | |
| radii[i] = getRadius(atoms[i]); | ||
| } | ||
|
|
||
| // initialising the sphere points to sample | ||
| spherePoints = generateSpherePoints(nSpherePoints); | ||
|
|
||
| cons = 4.0 * Math.PI / nSpherePoints; | ||
| initSpherePoints(nSpherePoints); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -188,12 +197,21 @@ public AsaCalculator(Point3d[] atomCoords, double probe, int nSpherePoints, int | |
| this.probe = probe; | ||
| this.nThreads = nThreads; | ||
|
|
||
| this.useSpatialHashingForNeighbors = DEFAULT_USE_SPATIAL_HASHING; | ||
|
|
||
| // initialising the radii to the given radius for all atoms | ||
| radii = new double[atomCoords.length]; | ||
| for (int i=0;i<atomCoords.length;i++) { | ||
| radii[i] = radius; | ||
| } | ||
|
|
||
| initSpherePoints(nSpherePoints); | ||
| } | ||
|
|
||
| private void initSpherePoints(int nSpherePoints) { | ||
|
|
||
| logger.debug("Will use {} sphere points", nSpherePoints); | ||
|
|
||
| // initialising the sphere points to sample | ||
| spherePoints = generateSpherePoints(nSpherePoints); | ||
|
|
||
|
|
@@ -209,7 +227,7 @@ public AsaCalculator(Point3d[] atomCoords, double probe, int nSpherePoints, int | |
| */ | ||
| public GroupAsa[] getGroupAsas() { | ||
|
|
||
| TreeMap<ResidueNumber, GroupAsa> asas = new TreeMap<ResidueNumber, GroupAsa>(); | ||
| TreeMap<ResidueNumber, GroupAsa> asas = new TreeMap<>(); | ||
|
|
||
| double[] asasPerAtom = calculateAsas(); | ||
|
|
||
|
|
@@ -238,12 +256,26 @@ public double[] calculateAsas() { | |
|
|
||
| double[] asas = new double[atomCoords.length]; | ||
|
|
||
| long start = System.currentTimeMillis(); | ||
| if (useSpatialHashingForNeighbors) { | ||
| logger.debug("Will use spatial hashing to find neighbors"); | ||
| neighborIndices = findNeighborIndicesSpatialHashing(); | ||
| } else { | ||
| logger.debug("Will not use spatial hashing to find neighbors"); | ||
| neighborIndices = findNeighborIndices(); | ||
| } | ||
| long end = System.currentTimeMillis(); | ||
| logger.debug("Took {} s to find neighbors", (end-start)/1000.0); | ||
|
|
||
| start = System.currentTimeMillis(); | ||
| if (nThreads<=1) { // (i.e. it will also be 1 thread if 0 or negative number specified) | ||
| logger.debug("Will use 1 thread for ASA calculation"); | ||
| for (int i=0;i<atomCoords.length;i++) { | ||
| asas[i] = calcSingleAsa(i); | ||
| } | ||
|
|
||
| } else { | ||
| logger.debug("Will use {} threads for ASA calculation", nThreads); | ||
| // NOTE the multithreaded calculation does not scale up well in some systems, | ||
| // why? I guess some memory/garbage collect problem? I tried increasing Xmx in pc8201 but didn't help | ||
|
|
||
|
|
@@ -293,10 +325,22 @@ public double[] calculateAsas() { | |
| while (!threadPool.isTerminated()); | ||
|
|
||
| } | ||
| end = System.currentTimeMillis(); | ||
| logger.debug("Took {} s to calculate all {} atoms ASAs (excluding neighbors calculation)", (end-start)/1000.0, atomCoords.length); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd probably keep invoke this function from a test and time and log there, eventually move it to a benchmark test suite.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My use case was testing other software that uses ASA calculation deep into some function call. I thought this would be a good way of debugging it. Do you see potential performance problems with this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I don't see any real issue. I personally tend to move code from main to tests whenever possible, simply to reduce clutter in the real code. currentTimeMillis probably cost nothing and the other computation in the logging steps seems minimal. I'm transitioning back to java, so I'm not an expert in java just yet. Feel free to ignore my comments here. I'm mostly writing for myself to get my head into some of biojava's code.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @kanishka-azimi Thanks for contributing! The more people reviewing PRs the better! |
||
|
|
||
| return asas; | ||
| } | ||
|
|
||
| /** | ||
| * Set the useSpatialHashingForNeighbors flag to use spatial hashing to calculate neighbors (true) or all-to-all | ||
| * distance calculation (false). Default is {@value DEFAULT_USE_SPATIAL_HASHING}. | ||
| * Use for testing performance only. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For small molecules is the old algorithm faster? If so maybe document that here.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Performance is essentially the same for small molecules. The gain is very noticeable in very large molecules. |
||
| * @param useSpatialHashingForNeighbors the flag | ||
| */ | ||
| void setUseSpatialHashingForNeighbors(boolean useSpatialHashingForNeighbors) { | ||
| this.useSpatialHashingForNeighbors = useSpatialHashingForNeighbors; | ||
| } | ||
|
|
||
| /** | ||
| * Returns list of 3d coordinates of points on a sphere using the | ||
| * Golden Section Spiral algorithm. | ||
|
|
@@ -317,36 +361,118 @@ private Point3d[] generateSpherePoints(int nSpherePoints) { | |
| } | ||
|
|
||
| /** | ||
| * Returns list of indices of atoms within probe distance to atom k. | ||
| * @param k index of atom for which we want neighbor indices | ||
| * Returns the 2-dimensional array with neighbor indices for every atom. | ||
| * @return 2-dimensional array of size: n_atoms x n_neighbors_per_atom | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. adjacency list vs adjacency matrix...possible prefer one or the other universally based on expected density of graph |
||
| */ | ||
| private Integer[] findNeighborIndices(int k) { | ||
| int[][] findNeighborIndices() { | ||
|
|
||
| // looking at a typical protein case, number of neighbours are from ~10 to ~50, with an average of ~30 | ||
| // Thus 40 seems to be a good compromise for the starting capacity | ||
| ArrayList<Integer> neighbor_indices = new ArrayList<>(40); | ||
| int initialCapacity = 60; | ||
|
|
||
| double radius = radii[k] + probe + probe; | ||
| int[][] nbsIndices = new int[atomCoords.length][]; | ||
|
|
||
| for (int i=0;i<atomCoords.length;i++) { | ||
| if (i==k) continue; | ||
| for (int k=0; k<atomCoords.length; k++) { | ||
| double radius = radii[k] + probe + probe; | ||
|
|
||
| List<Integer> thisNbIndices = new ArrayList<>(initialCapacity); | ||
|
|
||
| for (int i = 0; i < atomCoords.length; i++) { | ||
| if (i == k) continue; | ||
|
|
||
| double dist = atomCoords[i].distance(atomCoords[k]); | ||
|
|
||
| if (dist < radius + radii[i]) { | ||
| thisNbIndices.add(i); | ||
| } | ||
| } | ||
|
|
||
| int[] indicesArray = new int[thisNbIndices.size()]; | ||
| for (int i=0;i<thisNbIndices.size();i++) indicesArray[i] = thisNbIndices.get(i); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Very minor note. Since random access isn't being used in the array list, could use Queue interface instead of List. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some cute shortcuts - https://stackoverflow.com/questions/960431/how-to-convert-listinteger-to-int-in-java . I don't know if biojava has any general preference regarding guava or apache commons.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks! I like the answer that uses the stream API: https://stackoverflow.com/a/23945015/3914327 |
||
| nbsIndices[k] = indicesArray; | ||
| } | ||
| return nbsIndices; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the 2-dimensional array with neighbor indices for every atom, | ||
| * using spatial hashing to avoid all to all distance calculation. | ||
| * @return 2-dimensional array of size: n_atoms x n_neighbors_per_atom | ||
| */ | ||
| int[][] findNeighborIndicesSpatialHashing() { | ||
|
|
||
| double dist = atomCoords[i].distance(atomCoords[k]); | ||
| // looking at a typical protein case, number of neighbours are from ~10 to ~50, with an average of ~30 | ||
| int initialCapacity = 60; | ||
|
|
||
| List<Contact> contactList = calcContacts(); | ||
| Map<Integer, List<Integer>> indices = new HashMap<>(atomCoords.length); | ||
| for (Contact contact : contactList) { | ||
| // note contacts are stored 1-way only, with j>i | ||
| int i = contact.getI(); | ||
| int j = contact.getJ(); | ||
|
|
||
| List<Integer> iIndices; | ||
| List<Integer> jIndices; | ||
| if (!indices.containsKey(i)) { | ||
| iIndices = new ArrayList<>(initialCapacity); | ||
| indices.put(i, iIndices); | ||
| } else { | ||
| iIndices = indices.get(i); | ||
| } | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. minor: putIfAbsent |
||
| if (!indices.containsKey(j)) { | ||
| jIndices = new ArrayList<>(initialCapacity); | ||
| indices.put(j, jIndices); | ||
| } else { | ||
| jIndices = indices.get(j); | ||
| } | ||
|
|
||
| if (dist < radius + radii[i]) { | ||
| neighbor_indices.add(i); | ||
| double radius = radii[i] + probe + probe; | ||
| double dist = contact.getDistance(); | ||
| if (dist < radius + radii[j]) { | ||
| iIndices.add(j); | ||
| jIndices.add(i); | ||
| } | ||
| } | ||
|
|
||
| // convert map to array for fast access | ||
| int[][] nbsIndices = new int[atomCoords.length][]; | ||
| for (Map.Entry<Integer, List<Integer>> entry : indices.entrySet()) { | ||
| List<Integer> list = entry.getValue(); | ||
| int[] indicesArray = new int[list.size()]; | ||
| for (int i=0;i<entry.getValue().size();i++) indicesArray[i] = list.get(i); | ||
| nbsIndices[entry.getKey()] = indicesArray; | ||
| } | ||
|
|
||
| Integer[] indicesArray = new Integer[neighbor_indices.size()]; | ||
| indicesArray = neighbor_indices.toArray(indicesArray); | ||
| return indicesArray; | ||
| return nbsIndices; | ||
| } | ||
|
|
||
| Point3d[] getAtomCoords() { | ||
| return atomCoords; | ||
| } | ||
|
|
||
| private List<Contact> calcContacts() { | ||
| double maxRadius = maxValue(radii); | ||
| double cutoff = maxRadius + maxRadius + probe + probe; | ||
| logger.debug("Max radius is {}, cutoff is {}", maxRadius, cutoff); | ||
| Grid grid = new Grid(cutoff); | ||
| grid.addCoords(atomCoords); | ||
| return grid.getIndicesContacts(); | ||
| } | ||
|
|
||
| private static double maxValue(double[] array) { | ||
| double max = array[0]; | ||
| for (int i = 0; i < array.length; i++) { | ||
| if (array[i] > max) { | ||
| max = array[i]; | ||
| } | ||
| } | ||
| return max; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. minor: some stream shortcut could replace this function. |
||
| } | ||
|
|
||
| private double calcSingleAsa(int i) { | ||
| Point3d atom_i = atomCoords[i]; | ||
| Integer[] neighbor_indices = findNeighborIndices(i); | ||
| int n_neighbor = neighbor_indices.length; | ||
|
|
||
| int n_neighbor = neighborIndices[i].length; | ||
| int[] neighbor_indices = neighborIndices[i]; | ||
| int j_closest_neighbor = 0; | ||
| double radius = probe + radii[i]; | ||
|
|
||
|
|
@@ -497,7 +623,7 @@ private static double getRadiusForNucl(NucleotideImpl nuc, Atom atom) { | |
| * | ||
| * If atom is neither part of a nucleotide nor of a standard aminoacid, | ||
| * the default vdw radius for the element is returned. If atom is of | ||
| * unknown type (element) the vdw radius of {@link #Element().N} is returned | ||
| * unknown type (element) the vdw radius of {@link Element().N} is returned | ||
| * | ||
| * @param atom | ||
| * @return | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor point, but maybe we should try to avoid wildcard imports?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's an IntelliJ feature. I do like individual imports better too... I'll have to check if I can configure IntelliJ to do it.