Skip to content

Commit 903891e

Browse files
committed
Finally fixing #234.
This includes an important bug fix for the mmCIF parser: quoted strings appearing at the end of a line with no trailing spaces were not parsed correctly (the closing quote character was included in the string). Trailing spaces are not required by the CIF format, so the parser must handle those cases correctly.
1 parent b6209c2 commit 903891e

File tree

4 files changed

+69
-38
lines changed

4 files changed

+69
-38
lines changed

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifConsumer.java

Lines changed: 53 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,19 @@ public class SimpleMMcifConsumer implements MMcifConsumer {
7171
private List<StructConn> structConn;
7272
private List<StructNcsOper> structNcsOper;
7373

74+
/**
75+
* A map of asym ids (internal chain ids) to strand ids (author chain ids)
76+
* extracted from pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme categories
77+
*/
7478
private Map<String,String> asymStrandId;
7579

80+
/**
81+
* A map of asym ids (internal chain ids) to strand ids (author chain ids)
82+
* extracted from the information in the _atom_site category. Will be used
83+
* if no mapping is found in pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme
84+
*/
85+
private Map<String,String> asymId2StrandIdFromAtomSites;
86+
7687
private Map<String,String> asymId2entityId;
7788

7889
private String current_nmr_model ;
@@ -411,15 +422,15 @@ public void newAtomSite(AtomSite atom) {
411422
if ( params.isParseCAOnly() ){
412423
// yes , user wants to get CA only
413424
// only parse CA atoms...
414-
if (! (atom.getLabel_atom_id().equals("CA") && atom.getType_symbol().equals("C"))) {
425+
if (! (atom.getLabel_atom_id().equals(StructureTools.CA_ATOM_NAME) && atom.getType_symbol().equals("C"))) {
415426
//System.out.println("ignoring " + line);
416427
//atomCount--;
417428
return;
418429
}
419430
}
420431

421-
422-
432+
// filling the map in case there's no pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme in the file
433+
asymId2StrandIdFromAtomSites.put(atom.getLabel_asym_id(), atom.getAuth_asym_id());
423434

424435
//see if chain_id is one of the previous chains ...
425436

@@ -566,7 +577,8 @@ public void documentStart() {
566577
entityChains = new ArrayList<Chain>();
567578
structAsyms = new ArrayList<StructAsym>();
568579
asymStrandId = new HashMap<String, String>();
569-
asymId2entityId = new HashMap<String,String>();
580+
asymId2StrandIdFromAtomSites = new HashMap<String, String>();
581+
asymId2entityId = new HashMap<String,String>();
570582
structOpers = new ArrayList<PdbxStructOperList>();
571583
strucAssemblies = new ArrayList<PdbxStructAssembly>();
572584
strucAssemblyGens = new ArrayList<PdbxStructAssemblyGen>();
@@ -635,30 +647,25 @@ public void documentEnd() {
635647
//TODO: add support for structure.setConnections(connects);
636648

637649

638-
// mismatching Author assigned chain IDS and PDB internal chain ids:
639-
// fix the chain IDS in the current model:
640-
641-
Set<String> asymIds = asymStrandId.keySet();
642650

643-
if (asymIds.isEmpty()) {
644-
logger.warn("No asym ids mapping found in file (categories pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme). Will create fake asym ids");
645-
646-
if (structure.nrModels()==0) {
647-
logger.error("We should have some models at this point, something is wrong! We'll have an empty structure");
648-
} else {
649-
for (Chain chain : structure.getModel(0)) {
650-
asymStrandId.put(chain.getChainID(),chain.getChainID());
651-
}
652-
}
651+
boolean noAsymStrandIdMappingPresent = false;
652+
if (asymStrandId.isEmpty()) {
653+
logger.warn("No pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme categories present. Will use chain id mapping from _atom_sites category");
654+
655+
asymStrandId = asymId2StrandIdFromAtomSites;
656+
noAsymStrandIdMappingPresent = true;
653657
}
658+
659+
// mismatching Author assigned chain IDS and PDB internal chain ids:
660+
// fix the chain IDS in the current model:
654661

655662
for (int i =0; i< structure.nrModels() ; i++){
656663
List<Chain> model = structure.getModel(i);
657664

658665
List<Chain> pdbChains = new ArrayList<Chain>();
659666

660667
for (Chain chain : model) {
661-
for (String asym : asymIds) {
668+
for (String asym : asymStrandId.keySet()) {
662669
if ( chain.getChainID().equals(asym)){
663670
String newChainId = asymStrandId.get(asym);
664671

@@ -703,7 +710,7 @@ public void documentEnd() {
703710
if (entityId==null) {
704711
// this can happen for instance if the cif file didn't have _struct_asym category at all
705712
// and thus we have no asymId2entityId mapping at all
706-
logger.warn("No entity id could be found for chain {}", chain.getInternalChainID());
713+
logger.warn("No entity id could be found for chain {}", chain.getInternalChainID());
707714
continue;
708715
}
709716
int eId = Integer.parseInt(entityId);
@@ -736,7 +743,33 @@ public void documentEnd() {
736743

737744
}
738745

746+
if (noAsymStrandIdMappingPresent) {
747+
// At this point we have to make sure that all chains are polymeric (possibly with some attached non-polymers)
748+
// because that's the current biojava model.
749+
// It can happen that all molecules are assigned to their own chains, for instance in mmCIF files
750+
// produced by phenix (in that case noAsymStrandIdMappingPresent is true, i.e. there is no pdbx_poly_seq_scheme).
751+
// mmCIF files produced by the PDB follow the convention: distinct asym_id for every
752+
// molecule (poly or non-poly) whilst a single author_asym_id for polymer + its ligands
753+
it = pdbChains.iterator();
754+
while (it.hasNext()) {
755+
Chain chain = it.next();
756+
GroupType predominantGroupType = StructureTools.getPredominantGroupType(chain);
757+
if (StructureTools.isChainWaterOnly(chain)) {
758+
it.remove();
759+
logger.warn("Chain with chain id {} (asym id {}) and {} residues, contains only waters. Will ignore the chain because it doesn't fit into the BioJava structure data model.",
760+
chain.getChainID(),chain.getInternalChainID(),chain.getAtomGroups().size());
761+
}
762+
else if (predominantGroupType != GroupType.AMINOACID &&
763+
predominantGroupType!=GroupType.NUCLEOTIDE ) {
764+
logger.warn("Chain with chain id {} (asym id {}) and {} residues, does not seem to be polymeric. Will ignore the chain because it doesn't fit into the BioJava structure data model.",
765+
chain.getChainID(),chain.getInternalChainID(),chain.getAtomGroups().size());
766+
it.remove();
767+
}
768+
}
769+
}
739770
}
771+
772+
740773

741774
// to make sure we have Compounds linked to chains, we call getCompounds() which will lazily initialise the
742775
// compounds using heuristics (see CompoundFinder) in the case that they were not explicitly present in the file

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifParser.java

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -377,9 +377,9 @@ private List<String> processSingleLine(String line){
377377
if (i < line.length() - 1)
378378
nextC = line.charAt(i+1);
379379

380-
Character lastC = null;
380+
Character prevC = null;
381381
if (i>0)
382-
lastC = line.charAt(i-1);
382+
prevC = line.charAt(i-1);
383383

384384
if (c == ' ') {
385385

@@ -398,12 +398,9 @@ private List<String> processSingleLine(String line){
398398

399399
boolean wordEnd = false;
400400
if (! inS2) {
401-
if (nextC != null){
402-
//System.out.println("nextC: >"+nextC+"<");
403-
if ( Character.isWhitespace(nextC)){
404-
i++;
405-
wordEnd = true;
406-
}
401+
if (nextC==null || Character.isWhitespace(nextC)){
402+
i++;
403+
wordEnd = true;
407404
}
408405
}
409406

@@ -420,7 +417,7 @@ private List<String> processSingleLine(String line){
420417
word += c;
421418
}
422419

423-
} else if (lastC==null || lastC==' ') {
420+
} else if (prevC==null || prevC==' ') {
424421
// the beginning of a new string
425422
inString = true;
426423
inS1 = true;
@@ -432,12 +429,9 @@ private List<String> processSingleLine(String line){
432429

433430
boolean wordEnd = false;
434431
if (! inS1) {
435-
if (nextC != null){
436-
//System.out.println("nextC: >"+nextC+"<");
437-
if ( Character.isWhitespace(nextC)){
438-
i++;
439-
wordEnd = true;
440-
}
432+
if (nextC==null || Character.isWhitespace(nextC)){
433+
i++;
434+
wordEnd = true;
441435
}
442436
}
443437

@@ -452,7 +446,7 @@ private List<String> processSingleLine(String line){
452446
} else {
453447
word += c;
454448
}
455-
} else if (lastC==null || lastC==' ') {
449+
} else if (prevC==null || prevC==' ') {
456450
// the beginning of a new string
457451
inString = true;
458452
inS2 = true;

biojava-structure/src/test/java/org/biojava/nbio/structure/io/TestNonDepositedFiles.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ private void checkChains(Structure s) {
194194
* See github issue #234
195195
* @throws IOException
196196
*/
197-
//@Test
197+
@Test
198198
public void testPhenixFile() throws IOException {
199199
InputStream inStream = new GZIPInputStream(this.getClass().getResourceAsStream("/org/biojava/nbio/structure/io/4lup_phenix_output.cif.gz"));
200200
MMcifParser parser = new SimpleMMcifParser();

biojava-structure/src/test/resources/org/biojava/nbio/structure/io/difficult_mmcif_quoting.cif

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ J.Mol.Biol. 309 401 ? 2001 JMOBAK UK 0022-2836 0070 ? 11371161 10
7474
'Acta Crystallogr.,Sect.D' 56 137 ? 2000 ABCRE6 DK 0907-4449 0766 ? 10666593 10.1107/S0907444999015334
7575
2 'Structure of the Capsid of Pf3 Filamentous Phage Determined from X-Ray Fibre Diffraction Data at 3.1 A Resolution'
7676
J.Mol.Biol. 283 155 ? 1998 JMOBAK UK 0022-2836 0070 ? 9761681 10.1006/JMBI.1998.2081
77+
#
78+
_space_group.name_H-M_alt 'P 1 21 1'
79+
_space_group.name_Hall " P 2yb"
80+
_space_group.IT_number 4
81+
_space_group.crystal_system monoclinic
7782
#
7883
loop_
7984
_citation_author.citation_id
@@ -83,4 +88,3 @@ primary 'Pederson, D.M.' 1
8388
primary "Welsh, L.C." 2
8489
primary 'Marvin, D.A.' 3
8590

86-

0 commit comments

Comments
 (0)