-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathChainAligner.java
More file actions
165 lines (149 loc) · 6.14 KB
/
ChainAligner.java
File metadata and controls
165 lines (149 loc) · 6.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
package demo;
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.biojava.nbio.structure.Atom;
import org.biojava.nbio.structure.align.ce.CeCPMain;
import org.biojava.nbio.structure.align.fatcat.FatCatRigid;
import org.biojava.nbio.structure.align.model.AFPChain;
import org.biojava.spark.function.AlignmentTools;
import org.biojava.spark.utils.BiojavaSparkUtils;
import org.rcsb.mmtf.api.StructureDataInterface;
import org.rcsb.mmtf.spark.data.StructureDataRDD;
import org.rcsb.mmtf.spark.utils.SparkUtils;
import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.ArgumentParserException;
import net.sourceforge.argparse4j.inf.Namespace;
import scala.Tuple2;
import scala.Tuple3;
import scala.Tuple6;
/**
 * Demo code that aligns every pair of chains in a data set and reports the
 * resulting alignment scores.
 * <p>
 * The chains come either from a list of PDB ids / files given on the command
 * line or from a Hadoop sequence file. Each pair of chains is aligned with the
 * selected BioJava algorithm and the scores are written out as comma separated
 * text (see {@link #writeData(JavaRDD)}).
 * @author Anthony Bradley
 *
 */
public class ChainAligner {

    /** Directory Spark writes the per-partition score files to before they are combined. */
    private static final String TMP_DIR_NAME = "SCORES";

    /** Default of the {@code --hadoop} option; means "no Hadoop sequence file was given". */
    private static final String DEFAULT_PATH = "";

    /**
     * Align all pairs of chains from the requested input and output their scores.
     * <p>
     * When the alignment method is {@code "DUMMY"} only the mean of the TM-score
     * column is printed; otherwise all scores are written via
     * {@link #writeData(JavaRDD)}.
     * @param args the command line arguments, see {@link #parseArgs(String[])}
     * for the accepted options
     * @throws IOException an error writing the scores to the file system
     */
    public static void main(String[] args) throws IOException {
        // Read the arguments
        Namespace ns = parseArgs(args);
        // Only these options are consumed here; the remaining parsed options
        // (score, numclusts, cluster, threshold) are not read by this class.
        String alignMethod = ns.getString("align");
        String filePath = ns.getString("hadoop");
        int minLength = ns.getInt("minlength");
        double sample = ns.getDouble("sample");
        boolean useFiles = ns.getBoolean("files");
        // Get the list of PDB ids (or file paths when --files is true)
        List<String> pdbIdList = ns.<String> getList("pdbId");

        // Get the chains that correspond to the requested input
        JavaPairRDD<String, Atom[]> chainRDD = loadChains(pdbIdList, useFiles, filePath, minLength, sample);
        if (chainRDD == null) {
            System.out.println("Must specify PDB ids or an hadoop sequence file");
            return;
        }
        System.out.println("Analysisng " + chainRDD.count() + " chains");

        // Pair the chains up for comparison.
        // NOTE(review): presumably getHalfCartesian yields each unordered pair once - confirm in SparkUtils.
        JavaPairRDD<Tuple2<String,Atom[]>,Tuple2<String, Atom[]>> comparisons =
                SparkUtils.getHalfCartesian(chainRDD, chainRDD.getNumPartitions());
        // Align each pair, keeping the two chain ids alongside the alignment
        JavaRDD<Tuple3<String, String, AFPChain>> similarities = comparisons.map(
                t -> new Tuple3<String, String, AFPChain>(t._1._1, t._2._1,
                        AlignmentTools.getBiojavaAlignment(t._1._2, t._2._2, alignMethod)));
        // Flatten to (id one, id two, TM score, RMSD, length, align score); cached because
        // it is the RDD every output path below starts from.
        JavaRDD<Tuple6<String, String, Double, Double, Double, Double>> allScores = similarities.map(
                t -> new Tuple6<String, String, Double, Double, Double, Double>(
                        t._1(), t._2(), t._3().getTMScore(), t._3().getTotalRmsdOpt(),
                        (double) t._3().getTotalLenOpt(), t._3().getAlignScore())).cache();

        if ("DUMMY".equals(alignMethod)) {
            // DUMMY mode: just summarise the TM-score column instead of writing a file
            JavaDoubleRDD doubleDist = allScores.mapToDouble(t -> t._3());
            System.out.println("Average dist: "+doubleDist.mean());
        }
        else {
            writeData(allScores);
        }
    }

    /**
     * Build the RDD of chains from whichever input the user specified.
     * Positional ids/files take precedence over the Hadoop sequence file.
     * @param pdbIdList the positional arguments (PDB ids, or file paths if
     * {@code useFiles} is true); may be null or empty
     * @param useFiles true to treat {@code pdbIdList} as paths of files to read
     * @param filePath the Hadoop sequence file path, or the default (empty) path
     * @param minLength the minimum chain length passed on to {@link BiojavaSparkUtils}
     * @param sample the sample fraction passed on when reading the Hadoop file
     * @return the chain RDD, or null if neither ids nor a Hadoop file were given
     */
    private static JavaPairRDD<String, Atom[]> loadChains(List<String> pdbIdList, boolean useFiles,
            String filePath, int minLength, double sample) {
        if (pdbIdList != null && !pdbIdList.isEmpty()) {
            if (useFiles) {
                // Read the given files and convert each into a StructureDataInterface
                StructureDataRDD structureDataRDD = new StructureDataRDD(
                        BiojavaSparkUtils.getFromList(convertToFiles(pdbIdList))
                        .mapToPair(t -> new Tuple2<String, StructureDataInterface>(
                                t._1, BiojavaSparkUtils.convertToStructDataInt(t._2))));
                return BiojavaSparkUtils.getChainRDD(structureDataRDD, minLength);
            }
            return BiojavaSparkUtils.getChainRDD(pdbIdList, minLength);
        }
        // Constant-first comparison so a null path is treated like "not specified"
        if (filePath != null && !DEFAULT_PATH.equals(filePath)) {
            return BiojavaSparkUtils.getChainRDD(filePath, minLength, sample);
        }
        return null;
    }

    /**
     * Convert a list of {@link String}s to an array of {@link File}s
     * @param pdbIdList the input list of {@link String}s
     * @return the array of {@link File}s, in the same order as the input
     */
    private static File[] convertToFiles(List<String> pdbIdList) {
        File[] outList = new File[pdbIdList.size()];
        int index = 0;
        for (String path : pdbIdList) {
            outList[index++] = new File(path);
        }
        return outList;
    }

    /**
     * Write out the data and combine into a single file.
     * <p>
     * Each score tuple becomes one comma separated line. Spark first writes the
     * lines into the temporary directory, which is then combined and removed.
     * @param inputScores the input scores
     * @throws IOException an error writing to a file on the file system
     */
    private static void writeData(JavaRDD<Tuple6<String, String, Double, Double, Double, Double>> inputScores) throws IOException {
        // One CSV line per compared pair
        inputScores.map(t -> t._1()+","+t._2()+","+t._3()+","+t._4()+","+t._5()+","+t._6())
                .saveAsTextFile(TMP_DIR_NAME);
        File outFile = new File(TMP_DIR_NAME);
        try {
            // NOTE(review): presumably merges the part files under the given header line -
            // confirm the output location in SparkUtils.combineDirToFile.
            SparkUtils.combineDirToFile(outFile, "ID ONE, ID TWO, TM SCORE, RMSD, LENGTH, ALIGN\n");
        } finally {
            // Always remove the temporary directory: if it were left behind after a
            // failure, the next saveAsTextFile to the same path would be rejected.
            FileUtils.deleteDirectory(outFile);
        }
    }

    /**
     * Parse the input arguments and return the {@link Namespace} object.
     * On a parse error the error is reported and the JVM exits with status 1.
     * @param args the input argument list
     * @return the parsed arguments as a {@link Namespace} object
     */
    private static Namespace parseArgs(String[] args) {
        ArgumentParser parser = ArgumentParsers.newArgumentParser("Align")
                .defaultHelp(true)
                .description("Calculate the alignment of multiple structures");
        parser.addArgument("-a", "--align")
                .choices(CeCPMain.algorithmName, FatCatRigid.algorithmName, "DUMMY").setDefault(FatCatRigid.algorithmName)
                .help("Specify alignment method");
        parser.addArgument("-s", "--score")
                .choices("TM","RMSD").setDefault("TM")
                .help("Specify scoring method");
        parser.addArgument("-u", "--files").type(Boolean.class)
                .setDefault(false);
        parser.addArgument("-k", "--numclusts")
                .help("The number of clusters").setDefault(2);
        parser.addArgument("-c", "--cluster")
                .choices("PIC").setDefault("PIC")
                .help("Specify clustering method");
        parser.addArgument("pdbId").nargs("*")
                .help("The PDB Ids to consider");
        parser.addArgument("-z", "--sample").type(Double.class)
                .help("The sample of the PDB to take").setDefault(1.00);
        // Must be Integer: main reads this option with Namespace.getInt, which would fail
        // with a ClassCastException on a user supplied value converted to Double.
        parser.addArgument("-l", "--minlength").type(Integer.class)
                .help("The minimum length of each chain").setDefault(60);
        parser.addArgument("-t", "--threshold").type(Double.class)
                .help("The threshold to define an edge").setDefault(0.5);
        parser.addArgument("-f", "--hadoop")
                .help("The hadoop file to read from").setDefault(DEFAULT_PATH);
        Namespace ns = null;
        try {
            ns = parser.parseArgs(args);
        } catch (ArgumentParserException e) {
            parser.handleError(e);
            System.exit(1);
        }
        return ns;
    }
}