Skip to content

Commit b6209c2

Browse files
committed
Major change of the mmcif parser: now it copes with comments and empty
lines. All tests pass
1 parent 43f32d1 commit b6209c2

File tree

4 files changed

+136
-33
lines changed

4 files changed

+136
-33
lines changed

biojava-structure/src/main/java/org/biojava/nbio/structure/io/MMCIFFileTools.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ public static String toMMCIF(List<Object> list) {
7979
sb.append(toSingleLineMmCifString(o, sizes));
8080
}
8181

82-
sb.append(SimpleMMcifParser.LOOP_END+newline);
82+
sb.append(SimpleMMcifParser.COMMENT_CHAR+newline);
8383

8484
return sb.toString();
8585
}

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifParser.java

Lines changed: 86 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ public class SimpleMMcifParser implements MMcifParser {
7474
*/
7575
public static final String MMCIF_TOP_HEADER = "data_";
7676

77-
public static final String LOOP_END = "#";
77+
public static final String COMMENT_CHAR = "#";
7878
public static final String LOOP_START = "loop_";
7979
public static final String FIELD_LINE = "_";
8080

@@ -160,6 +160,8 @@ public void parse(BufferedReader buf)
160160
String line = null;
161161

162162
boolean inLoop = false;
163+
boolean inLoopData = false;
164+
163165

164166
List<String> loopFields = new ArrayList<String>();
165167
List<String> lineData = new ArrayList<String>();
@@ -177,33 +179,68 @@ public void parse(BufferedReader buf)
177179
}
178180

179181
while ( (line = buf.readLine ()) != null ){
182+
183+
if (line.isEmpty() || line.startsWith(COMMENT_CHAR)) continue;
180184

181185
logger.debug(inLoop + " " + line);
182186

183187

184-
if ( inLoop){
188+
if ( inLoop) {
185189

186-
if (line.startsWith(LOOP_END) || line.isEmpty()){
187-
// reset all data
188-
inLoop = false;
189-
lineData.clear();
190-
category=null;
190+
191+
if ( line.startsWith(LOOP_START)){
191192
loopFields.clear();
192-
loopWarnings.clear();
193-
logger.debug("Detected LOOP_END: '{}'. Toggling to inLoop=false", LOOP_END);
193+
inLoop = true;
194+
inLoopData = false;
194195
continue;
195-
196-
197196
}
198197

199-
if ( line.matches("\\s*"+FIELD_LINE+"\\w+.*")) {// startsWith(FIELD_LINE)){
198+
if ( line.matches("\\s*"+FIELD_LINE+"\\w+.*")) {
199+
200+
if (inLoopData && line.startsWith(FIELD_LINE)) {
201+
logger.debug("Found a field line after reading loop data. Toggling to inLoop=false");
202+
inLoop = false;
203+
inLoopData = false;
204+
loopFields.clear();
205+
206+
207+
// a boring normal line
208+
List<String> data = processLine(line, buf, 2);
209+
210+
if ( data.size() < 1){
211+
// this can happen if empty lines at end of file
212+
lineData.clear();
213+
continue;
214+
}
215+
String key = data.get(0);
216+
int pos = key.indexOf(".");
217+
if ( pos < 0 ) {
218+
// looks like a chem_comp file
219+
// line should start with data, otherwise something is wrong!
220+
if (! line.startsWith(MMCIF_TOP_HEADER)){
221+
logger.warn("This does not look like a valid mmCIF file! The first line should start with 'data_', but is '" + line+"'");
222+
triggerDocumentEnd();
223+
return;
224+
}
225+
// ignore the first line...
226+
category=null;
227+
lineData.clear();
228+
continue;
229+
}
230+
category = key.substring(0,pos);
231+
String value = data.get(1);
232+
loopFields.add(key.substring(pos+1,key.length()));
233+
lineData.add(value);
234+
235+
logger.debug("Found data for category {}: {}", key, value);
236+
continue;
237+
}
238+
200239
// found another field.
201240
String txt = line.trim();
202-
//System.out.println("line: " + txt);
203241
if ( txt.indexOf('.') > -1){
204242

205243
String[] spl = txt.split("\\.");
206-
//System.out.println(spl.length);
207244
category = spl[0];
208245
String attribute = spl[1];
209246
loopFields.add(attribute);
@@ -214,7 +251,7 @@ public void parse(BufferedReader buf)
214251

215252
} else {
216253
category = txt;
217-
logger.debug("Found category: {}",category);
254+
logger.debug("Found category without attribute: {}",category);
218255
}
219256

220257

@@ -223,41 +260,39 @@ public void parse(BufferedReader buf)
223260
// in loop and we found a data line
224261
lineData = processLine(line, buf, loopFields.size());
225262
logger.debug("Found a loop data line with {} data fields", lineData.size());
263+
logger.debug("Data fields: {}", lineData.toString());
226264
if ( lineData.size() != loopFields.size()){
227265
logger.warn("Expected {} data fields, but found {} in line: {}",loopFields.size(),lineData.size(),line);
228266

229267
}
230268

231-
endLineChecks(category, loopFields,lineData, loopWarnings);
269+
endLineChecks(category, loopFields, lineData, loopWarnings);
232270

233271
lineData.clear();
234272

273+
inLoopData = true;
235274
}
236275

237276
} else {
238277
// not in loop
239278

240279
if ( line.startsWith(LOOP_START)){
241-
loopFields.clear();
242-
loopWarnings.clear();
243-
inLoop = true;
244-
category=null;
245-
lineData.clear();
246-
logger.debug("Detected LOOP_START: '{}'. Toggling to inLoop=true", LOOP_START);
247-
continue;
248-
} else if (line.startsWith(LOOP_END) || line.isEmpty()){
249-
inLoop = false;
250280
if ( category != null)
251281
endLineChecks(category, loopFields, lineData, loopWarnings);
282+
283+
resetBuffers(loopFields, lineData, loopWarnings);
252284
category = null;
253-
loopFields.clear();
254-
loopWarnings.clear();
255-
lineData.clear();
285+
inLoop = true;
286+
inLoopData = false;
287+
logger.debug("Detected LOOP_START: '{}'. Toggling to inLoop=true", LOOP_START);
288+
continue;
256289
} else {
290+
logger.debug("Normal line ");
291+
inLoop = false;
292+
257293
// a boring normal line
258-
//System.out.println("boring data line: " + line + " " + inLoop + " " );
259294
List<String> data = processLine(line, buf, 2);
260-
//System.out.println("got a single line " + data);
295+
261296
if ( data.size() < 1){
262297
// this can happen if empty lines at end of file
263298
lineData.clear();
@@ -278,15 +313,30 @@ public void parse(BufferedReader buf)
278313
lineData.clear();
279314
continue;
280315
}
316+
317+
if (category!=null && !key.substring(0,pos).equals(category)) {
318+
// we've changed category: need to flush the last one
319+
endLineChecks(category, loopFields, lineData, loopWarnings);
320+
resetBuffers(loopFields, lineData, loopWarnings);
321+
}
322+
281323
category = key.substring(0,pos);
324+
282325
String value = data.get(1);
283326
loopFields.add(key.substring(pos+1,key.length()));
284327
lineData.add(value);
285328

329+
logger.debug("Found data for category {}: {}", key, value);
286330

287331
}
288332
}
289333
}
334+
335+
if (category!=null && lineData.size()>0 && lineData.size()==loopFields.size()) {
336+
// the last category in the file will still be missing
337+
endLineChecks(category, loopFields, lineData, loopWarnings);
338+
resetBuffers(loopFields, lineData, loopWarnings);
339+
}
290340

291341
if (struct != null){
292342
triggerStructData(struct);
@@ -295,6 +345,12 @@ public void parse(BufferedReader buf)
295345
triggerDocumentEnd();
296346

297347
}
348+
349+
private void resetBuffers(List<String> loopFields, List<String> lineData, Set<String> loopWarnings) {
350+
loopFields.clear();
351+
lineData.clear();
352+
loopWarnings.clear();
353+
}
298354

299355
private List<String> processSingleLine(String line){
300356

biojava-structure/src/test/java/org/biojava/nbio/structure/io/TestDifficultMmCIFFiles.java

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,4 +194,37 @@ public void testQuotingCornerCase () throws IOException {
194194

195195

196196
}
197+
198+
/**
199+
* The last category in the 2KLI mmCIF file is _pdbx_struct_oper_list, which is needed for
200+
* the biounit annotation.
201+
* This test makes sure that the last category in an mmCIF file is not missed because
202+
* of its position as last one in file.
203+
* @throws IOException
204+
* @throws StructureException
205+
*/
206+
@Test
207+
public void test2KLI() throws IOException, StructureException {
208+
209+
AtomCache cache = new AtomCache();
210+
211+
StructureIO.setAtomCache(cache);
212+
213+
FileParsingParameters params = cache.getFileParsingParams();
214+
params.setParseBioAssembly(true);
215+
StructureIO.setAtomCache(cache);
216+
217+
218+
cache.setUseMmCif(true);
219+
Structure sCif = StructureIO.getStructure("2KLI");
220+
221+
assertNotNull(sCif);
222+
223+
assertNotNull(sCif.getPDBHeader().getBioAssemblies());
224+
225+
Map<Integer,BioAssemblyInfo> mapCif = sCif.getPDBHeader().getBioAssemblies();
226+
227+
assertNotNull(mapCif);
228+
229+
}
197230
}

biojava-structure/src/test/resources/org/biojava/nbio/structure/io/difficult_mmcif_quoting.cif

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ _atom_site.label_alt_id
88
_atom_site.label_comp_id
99
_atom_site.label_asym_id
1010
_atom_site.label_entity_id
11+
# a comment line
1112
_atom_site.label_seq_id
1213
_atom_site.pdbx_PDB_ins_code
1314
_atom_site.Cartn_x
@@ -22,6 +23,7 @@ _atom_site.auth_atom_id
2223
_atom_site.pdbx_PDB_model_num
2324
ATOM 1728 O OP1 . DT C 2 1 ? 7.732 19.982 88.407 0.00 20.42 107 DT B OP1 1
2425
ATOM 1730 O O5' . DT C 2 1 ? 7.464 18.547 86.371 0.00 21.57 107 DT B O5' 1
26+
# a comment line
2527
ATOM 1738 C H2" . DT C 2 1 ? 8.111 19.111 84.111 0.00 29.00 107 DT B H2" 1
2628
ATOM 1730 O "O3'" . DT C 2 1 ? 7.111 18.111 86.111 0.00 21.00 107 DT B "O3'" 1
2729
#
@@ -32,6 +34,17 @@ _audit_author.pdbx_ordinal
3234
'Welsh, L.C.' 2
3335
"Marvin, D.A." 3
3436
#
37+
_exptl_crystal.id 1
38+
_exptl_crystal.density_meas ?
39+
_exptl_crystal.density_Matthews ?
40+
# a comment line
41+
_exptl_crystal.density_percent_sol ?
42+
_exptl_crystal.description
43+
;THE DATA IS DERIVED FROM CONTINUOUS TRANSFORM DATA AND THEREFORE THE NUMBER OF UNIQUE REFLECTIONS IS A MEANINGLESS NUMBER. THE STRUCTURE WAS REFINED AGAI THE SAME STRUCTURE FACTORS AS PDB ENTRY 1HGV, USIN
44+
G DIFFERENT REFINEMENT PROTOCOL. THEREFORE, THE STRUCT FACTORS FOR 1HHO, CAN BE TAKEN FROM R1HGVSF
45+
;
46+
#
47+
# a comment line
3548
loop_
3649
_pdbx_database_related.db_name
3750
_pdbx_database_related.db_id
@@ -42,7 +55,7 @@ PDB 1QL1 unspecified 'INOVIRUS (FILAMENTOUS BACTERIOPHAGE) STRAIN PF1 MAJOR COAT
4255
#
4356
loop_
4457
_citation.id
45-
_citation.title
58+
_citation.title
4659
_citation.journal_abbrev
4760
_citation.journal_volume
4861
_citation.page_first
@@ -69,4 +82,5 @@ _citation_author.ordinal
6982
primary 'Pederson, D.M.' 1
7083
primary "Welsh, L.C." 2
7184
primary 'Marvin, D.A.' 3
72-
#
85+
86+

0 commit comments

Comments
 (0)