@@ -71,8 +71,19 @@ public class SimpleMMcifConsumer implements MMcifConsumer {
7171 private List <StructConn > structConn ;
7272 private List <StructNcsOper > structNcsOper ;
7373
74+ /**
75+ * A map of asym ids (internal chain ids) to strand ids (author chain ids)
76+ * extracted from pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme categories
77+ */
7478 private Map <String ,String > asymStrandId ;
7579
80+ /**
81+ * A map of asym ids (internal chain ids) to strand ids (author chain ids)
82+ * extracted from the information in the _atom_site category. Will be used
83+ * if no mapping is found in pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme
84+ */
85+ private Map <String ,String > asymId2StrandIdFromAtomSites ;
86+
7687 private Map <String ,String > asymId2entityId ;
7788
7889 private String current_nmr_model ;
@@ -411,15 +422,15 @@ public void newAtomSite(AtomSite atom) {
411422 if ( params .isParseCAOnly () ){
412423 // yes , user wants to get CA only
413424 // only parse CA atoms...
414- if (! (atom .getLabel_atom_id ().equals ("CA" ) && atom .getType_symbol ().equals ("C" ))) {
425+ if (! (atom .getLabel_atom_id ().equals (StructureTools . CA_ATOM_NAME ) && atom .getType_symbol ().equals ("C" ))) {
415426 //System.out.println("ignoring " + line);
416427 //atomCount--;
417428 return ;
418429 }
419430 }
420431
421-
422-
432+ // filling the map in case there's no pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme in the file
433+ asymId2StrandIdFromAtomSites . put ( atom . getLabel_asym_id (), atom . getAuth_asym_id ());
423434
424435 //see if chain_id is one of the previous chains ...
425436
@@ -566,7 +577,8 @@ public void documentStart() {
566577 entityChains = new ArrayList <Chain >();
567578 structAsyms = new ArrayList <StructAsym >();
568579 asymStrandId = new HashMap <String , String >();
569- asymId2entityId = new HashMap <String ,String >();
580+ asymId2StrandIdFromAtomSites = new HashMap <String , String >();
581+ asymId2entityId = new HashMap <String ,String >();
570582 structOpers = new ArrayList <PdbxStructOperList >();
571583 strucAssemblies = new ArrayList <PdbxStructAssembly >();
572584 strucAssemblyGens = new ArrayList <PdbxStructAssemblyGen >();
@@ -635,30 +647,25 @@ public void documentEnd() {
635647 //TODO: add support for structure.setConnections(connects);
636648
637649
638- // mismatching Author assigned chain IDS and PDB internal chain ids:
639- // fix the chain IDS in the current model:
640-
641- Set <String > asymIds = asymStrandId .keySet ();
642650
643- if (asymIds .isEmpty ()) {
644- logger .warn ("No asym ids mapping found in file (categories pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme). Will create fake asym ids" );
645-
646- if (structure .nrModels ()==0 ) {
647- logger .error ("We should have some models at this point, something is wrong! We'll have an empty structure" );
648- } else {
649- for (Chain chain : structure .getModel (0 )) {
650- asymStrandId .put (chain .getChainID (),chain .getChainID ());
651- }
652- }
651+ boolean noAsymStrandIdMappingPresent = false ;
652+ if (asymStrandId .isEmpty ()) {
653+ logger .warn ("No pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme categories present. Will use chain id mapping from _atom_sites category" );
654+
655+ asymStrandId = asymId2StrandIdFromAtomSites ;
656+ noAsymStrandIdMappingPresent = true ;
653657 }
658+
659+ // mismatching Author assigned chain IDS and PDB internal chain ids:
660+ // fix the chain IDS in the current model:
654661
655662 for (int i =0 ; i < structure .nrModels () ; i ++){
656663 List <Chain > model = structure .getModel (i );
657664
658665 List <Chain > pdbChains = new ArrayList <Chain >();
659666
660667 for (Chain chain : model ) {
661- for (String asym : asymIds ) {
668+ for (String asym : asymStrandId . keySet () ) {
662669 if ( chain .getChainID ().equals (asym )){
663670 String newChainId = asymStrandId .get (asym );
664671
@@ -703,7 +710,7 @@ public void documentEnd() {
703710 if (entityId ==null ) {
704711 // this can happen for instance if the cif file didn't have _struct_asym category at all
705712 // and thus we have no asymId2entityId mapping at all
706- logger .warn ("No entity id could be found for chain {}" , chain .getInternalChainID ());
713+ logger .warn ("No entity id could be found for chain {}" , chain .getInternalChainID ());
707714 continue ;
708715 }
709716 int eId = Integer .parseInt (entityId );
@@ -736,7 +743,33 @@ public void documentEnd() {
736743
737744 }
738745
746+ if (noAsymStrandIdMappingPresent ) {
747+ // At this point we have to make sure that all chains are polymeric (possibly with some attached non-polymers)
748+ // because that's the current biojava model.
749+ // It can happen that all molecules are assigned to their own chains, for instance in mmCIF files
750+ // produced by phenix (in that case no asym id to strand id mapping will be present, because there is no pdbx_poly_seq_scheme).
751+ // mmCIF files produced by the PDB follow the convention: distinct asym_id for every
752+ // molecule (poly or non-poly) whilst a single author_asym_id for polymer + its ligands
753+ it = pdbChains .iterator ();
754+ while (it .hasNext ()) {
755+ Chain chain = it .next ();
756+ GroupType predominantGroupType = StructureTools .getPredominantGroupType (chain );
757+ if (StructureTools .isChainWaterOnly (chain )) {
758+ it .remove ();
759+ logger .warn ("Chain with chain id {} (asym id {}) and {} residues, contains only waters. Will ignore the chain because it doesn't fit into the BioJava structure data model." ,
760+ chain .getChainID (),chain .getInternalChainID (),chain .getAtomGroups ().size ());
761+ }
762+ else if (predominantGroupType != GroupType .AMINOACID &&
763+ predominantGroupType !=GroupType .NUCLEOTIDE ) {
764+ logger .warn ("Chain with chain id {} (asym id {}) and {} residues, does not seem to be polymeric. Will ignore the chain because it doesn't fit into the BioJava structure data model." ,
765+ chain .getChainID (),chain .getInternalChainID (),chain .getAtomGroups ().size ());
766+ it .remove ();
767+ }
768+ }
769+ }
739770 }
771+
772+
740773
741774 // to make sure we have Compounds linked to chains, we call getCompounds() which will lazily initialise the
742775 // compounds using heuristics (see CompoundFinder) in the case that they were not explicitly present in the file
0 commit comments