Skip to content

Commit 87b6be7

Browse files
authored
Merge pull request #706 from josemduarte/issue704
Some fixes related to caching and mmcif parsing
2 parents 482df9e + 2b1860b commit 87b6be7

File tree

6 files changed

+142
-43
lines changed

6 files changed

+142
-43
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/util/FlatFileCache.java

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -29,27 +29,23 @@
2929

3030
import java.io.*;
3131

32-
/** Provides a cache for storing multiple small files in memory. Can be used to e.g cache gzip compressed PDB files for avoiding disk IO bottlenecks.
33-
*
32+
/**
33+
* Provides a cache for storing multiple small files in memory. Can be used to e.g cache gzip compressed PDB files
34+
* for avoiding disk IO bottlenecks.
35+
* Note this is just a wrapper for the singleton cache.
36+
*
3437
* @author Andreas Prlic.
3538
*
3639
*/
3740
public class FlatFileCache {
3841

3942
private final static Logger logger = LoggerFactory.getLogger(FlatFileCache.class);
4043

41-
private static FlatFileCache me ;
42-
44+
/**
45+
* The cache singleton.
46+
*/
4347
private static SoftHashMap<String, byte[]> cache = new SoftHashMap<String, byte[]>(0);
4448

45-
public static FlatFileCache getInstance() {
46-
47-
if ( me == null){
48-
me = new FlatFileCache();
49-
}
50-
51-
return me;
52-
}
5349

5450
// no public constructor;
5551
private FlatFileCache(){
@@ -109,20 +105,17 @@ public static InputStream getInputStream(String key){
109105

110106
}
111107

112-
public int size() {
108+
public static int size() {
113109
if ( cache != null)
114110
return cache.size();
115111
else
116112
return -1;
117113
}
118114

119-
public void clear(){
115+
public static void clear(){
120116
cache.clear();
121117
}
122118

123-
public static void destroy(){
124-
me.clear();
125-
me = null;
126-
}
119+
127120

128121
}

biojava-core/src/main/java/org/biojava/nbio/core/util/InputStreamProvider.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,15 +65,14 @@ public class InputStreamProvider {
6565

6666
private boolean cacheRawFiles ;
6767

68-
FlatFileCache cache ;
6968
public InputStreamProvider() {
7069
super();
7170
cacheRawFiles = false;
7271

7372
String prop = System.getProperty(CACHE_PROPERTY);
7473
if ( prop != null && prop.equals("true")) {
7574
cacheRawFiles = true;
76-
cache = FlatFileCache.getInstance();
75+
7776
}
7877

7978
}

biojava-structure/src/main/java/org/biojava/nbio/structure/align/client/FarmJobRunnable.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -611,7 +611,7 @@ protected void sendResultsToServer(List<String> results) {
611611
if ( progressListeners != null)
612612
notifySubmittingAlignments(results.size(), msg);
613613
logger.info("{}: Sent {} results to server. job status: {}", userName, results.size(), counter);
614-
logger.info("{}: fileCache size: {}", userName, FlatFileCache.getInstance().size());
614+
logger.info("{}: fileCache size: {}", userName, FlatFileCache.size());
615615
}
616616

617617

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/DownloadChemCompProvider.java

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import java.io.InputStreamReader;
3030
import java.io.PrintWriter;
3131
import java.io.StringWriter;
32-
import java.net.HttpURLConnection;
3332
import java.net.URL;
3433
import java.net.URLConnection;
3534
import java.nio.file.Files;
@@ -49,14 +48,15 @@
4948

5049

5150

52-
/** This provider of chemical components can download and cache chemical component definition files from the RCSB PDB web site.
53-
* It is the default way to access these definitions.
54-
* If this provider is called he first time, it will download and install all chemical
55-
* component definitions in a local directory.
56-
* Once the definition files have been installed, it has quick startup time and low memory requirements.
51+
/**
52+
* This provider of chemical components can download and cache chemical component definition files from the RCSB PDB web site.
53+
* It is the default way to access these definitions.
54+
* If this provider is called he first time, it will download and install all chemical
55+
* component definitions in a local directory.
56+
* Once the definition files have been installed, it has quick startup time and low memory requirements.
5757
*
58-
* An alternative provider, that keeps all definitions in memory is the {@link AllChemCompProvider}. Another provider, that
59-
* does not require any network access, but only can support a limited set of chemical component definitions, is the {@link ReducedChemCompProvider}.
58+
* An alternative provider, that keeps all definitions in memory is the {@link AllChemCompProvider}. Another provider, that
59+
* does not require any network access, but only can support a limited set of chemical component definitions, is the {@link ReducedChemCompProvider}.
6060
*
6161
*
6262
* @author Andreas Prlic
@@ -310,7 +310,8 @@ public ChemComp getChemComp(String recordName) {
310310

311311
}
312312

313-
/** Returns the file name that contains the definition for this {@link ChemComp}
313+
/**
314+
* Returns the file name that contains the definition for this {@link ChemComp}
314315
*
315316
* @param recordName the ID of the {@link ChemComp}
316317
* @return full path to the file
@@ -359,6 +360,7 @@ private static boolean downloadChemCompRecord(String recordName) {
359360
File newFile;
360361
try{
361362
newFile = File.createTempFile("chemcomp"+recordName, "cif");
363+
logger.debug("Will write chem comp file to temp file {}", newFile.toString());
362364
}
363365
catch(IOException e){
364366
logger.error("Could not write to temp directory {} to create the chemical component download temp file", System.getProperty("java.io.tmpdir"));
@@ -379,7 +381,6 @@ private static boolean downloadChemCompRecord(String recordName) {
379381

380382
try {
381383
url = new URL(u);
382-
383384
URLConnection uconn = URLConnectionTools.openURLConnection(url);
384385

385386
try( PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(newFile)));

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifParser.java

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -211,24 +211,28 @@ public void parse(BufferedReader buf)
211211
Set<String> loopWarnings = new HashSet<String>(); // used only to reduce logging statements
212212

213213
String category = null;
214-
215-
216-
// the first line is a data_PDBCODE line, test if this looks like a mmcif file
217-
line = buf.readLine();
218-
if (line == null || !line.startsWith(MMCIF_TOP_HEADER)){
219-
logger.error("This does not look like a valid mmCIF file! The first line should start with 'data_', but is: '" + line+"'");
220-
triggerDocumentEnd();
221-
return;
222-
}
214+
215+
boolean foundHeader = false;
223216

224217
while ( (line = buf.readLine ()) != null ){
225218

226219
if (line.isEmpty() || line.startsWith(COMMENT_CHAR)) continue;
227220

221+
if (!foundHeader) {
222+
// the first non-comment line is a data_PDBCODE line, test if this looks like a mmcif file
223+
if (line.startsWith(MMCIF_TOP_HEADER)){
224+
foundHeader = true;
225+
continue;
226+
} else {
227+
triggerDocumentEnd();
228+
throw new IOException("This does not look like a valid mmCIF file! The first line should start with 'data_', but is: '" + line+"'");
229+
}
230+
}
231+
228232
logger.debug(inLoop + " " + line);
229233

230234
if (line.startsWith(MMCIF_TOP_HEADER)){
231-
// either first line in file, or beginning of new section
235+
// either first line in file, or beginning of new section (data block in CIF parlance)
232236
if ( inLoop) {
233237
//System.out.println("new data and in loop: " + line);
234238
inLoop = false;

biojava-structure/src/test/java/org/biojava/nbio/structure/TestDownloadChemCompProvider.java

Lines changed: 104 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,21 @@
2020
*/
2121
package org.biojava.nbio.structure;
2222

23-
import junit.framework.TestCase;
23+
import org.biojava.nbio.core.util.FlatFileCache;
2424
import org.biojava.nbio.structure.io.mmcif.DownloadChemCompProvider;
2525
import org.biojava.nbio.structure.io.mmcif.model.ChemComp;
26+
import org.junit.Test;
27+
import static org.junit.Assert.*;
2628

27-
public class TestDownloadChemCompProvider extends TestCase{
29+
import java.io.File;
30+
import java.io.FileOutputStream;
31+
import java.io.IOException;
32+
import java.io.PrintWriter;
33+
import java.util.zip.GZIPOutputStream;
2834

35+
public class TestDownloadChemCompProvider {
36+
37+
@Test
2938
public void testProtectedIDs(){
3039

3140
String id = "CON";
@@ -36,5 +45,98 @@ public void testProtectedIDs(){
3645

3746
assertEquals(cc.getId(), id);
3847
}
48+
49+
@Test
50+
public void testRedirectWorks() {
51+
// since August 2017, RCSB is redirecting:
52+
// http://rcsb.org/pdb/files/ligand/HEM.cif ----> http://files.org/ligands/HEM.cif
53+
// see #703
54+
55+
File file = new File(DownloadChemCompProvider.getLocalFileName("HEM"));
56+
file.delete();
57+
58+
DownloadChemCompProvider prov = new DownloadChemCompProvider();
59+
60+
DownloadChemCompProvider.serverBaseUrl = "http://www.rcsb.org/pdb/files/ligand/";
61+
62+
ChemComp cc = prov.getChemComp("HEM");
63+
64+
//System.out.println(file.toString());
65+
66+
assertTrue(file.exists());
67+
68+
// just in case the we did get garbage, let's clean up
69+
file.delete();
70+
71+
// very important: we have a memory cache of files, we need to reset it not to pollute the cache for later tests
72+
FlatFileCache.clear();
73+
74+
assertNotNull(cc);
75+
76+
assertNotNull(cc.getName());
77+
78+
// reset to default URL or otherwise we could affect other tests
79+
DownloadChemCompProvider.serverBaseUrl = DownloadChemCompProvider.DEFAULT_SERVER_URL;
80+
}
81+
82+
@Test
83+
public void testWeDontCacheGarbage() {
84+
// see #703
85+
86+
File file = new File(DownloadChemCompProvider.getLocalFileName("HEM"));
87+
88+
file.delete();
89+
90+
DownloadChemCompProvider prov = new DownloadChemCompProvider();
91+
92+
// a fake URL that should give a 404
93+
DownloadChemCompProvider.serverBaseUrl = "http://www.rcsb.org/non-existent-ligand-url/";
94+
95+
ChemComp cc = prov.getChemComp("HEM");
96+
97+
// we got a 404 back from server so we shouldn't have cached a file
98+
assertTrue(!file.exists());
99+
100+
file.delete();
101+
102+
// very important: we have a memory cache of files, we need to reset it not to pollute the cache for later tests
103+
FlatFileCache.clear();
104+
105+
// we couldn't parse, thus there should be no content
106+
assertNull(cc.getName());
107+
108+
// reset to default URL or otherwise we could affect other tests
109+
DownloadChemCompProvider.serverBaseUrl = DownloadChemCompProvider.DEFAULT_SERVER_URL;
110+
111+
112+
}
113+
114+
@Test
115+
public void testIfWeCachedGarbageWeCanDetectIt() throws IOException {
116+
// see #703
117+
// TODO this test for the moment only asserts that we get an empty chemcomp, since we can't detect bad cached files yet
118+
119+
File file = new File(DownloadChemCompProvider.getLocalFileName("HEM"));
120+
121+
PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(file)));
122+
pw.println("A lot of garbage");
123+
pw.close();
124+
125+
DownloadChemCompProvider prov = new DownloadChemCompProvider();
126+
127+
ChemComp cc = prov.getChemComp("HEM");
128+
129+
assertTrue(file.exists());
130+
131+
file.delete();
132+
133+
// very important: we have a memory cache of files, we need to reset it not to pollute the cache for later tests
134+
FlatFileCache.clear();
135+
136+
assertNull(cc.getName());
137+
138+
139+
140+
}
39141

40142
}

0 commit comments

Comments
 (0)