|
1 | 1 | # biojava-adam |
2 | 2 |
|
3 | | -BioJava and ADAM integration. |
| 3 | +[BioJava](http://biojava.org) and ADAM integration. |
4 | 4 |
|
5 | 5 | ### Hacking biojava-adam |
6 | 6 |
|
@@ -38,37 +38,58 @@ Type :help for more information. |
38 | 38 | scala> import org.biojava.nbio.adam.BiojavaAdamContext |
39 | 39 | import org.biojava.nbio.adam.BiojavaAdamContext |
40 | 40 |
|
41 | | -scala> val biojavaContext = new BiojavaAdamContext(sc) |
42 | | -biojavaContext: org.biojava.nbio.adam.BiojavaAdamContext = org.biojava.nbio.adam.BiojavaAdamContext@1e041848 |
| 41 | +scala> val bc = BiojavaAdamContext(sc) |
| 42 | +bc: org.biojava.nbio.adam.BiojavaAdamContext = org.biojava.nbio.adam.BiojavaAdamContext@4f8900b0 |
43 | 43 |
|
44 | | -scala> val reads = biojavaContext.loadFastqReads("fastq_sample1.fq") |
45 | | -reads: org.bdgenomics.adam.rdd.sequence.ReadRDD = ReadRDD(MapPartitionsRDD[1] at map at BiojavaAdamContext.java:180,SequenceDictionary{ |
46 | | -H06HDADXX130110:1:2103:11970:57672/2->250 |
47 | | -H06HDADXX130110:2:2116:3345:91806/2->250 |
48 | | -H06HDADXX130110:1:2103:11970:57672/1->250 |
49 | | -H06HDADXX130110:2:2116:3345:91806/1->250 |
50 | | -H06JUADXX130110:1:1108:6424:55322/1->250 |
51 | | -H06JUADXX130110:1:1108:6424:55322/2->250}) |
| 44 | +scala> val reads = bc.loadFastqReads("src/test/resources/org/biojava/nbio/adam/bqsr.0.fq") |
| 45 | +reads: org.bdgenomics.adam.rdd.read.ReadRDD = RDDBoundReadRDD with 0 reference sequences |
52 | 46 |
|
53 | 47 | scala> reads.rdd.first |
54 | | -res0: org.bdgenomics.formats.avro.Read = {"name": "H06HDADXX130110:2:2116:3345:91806/1", "description": |
55 | | -"H06HDADXX130110:2:2116:3345:91806/1", "alphabet": "DNA", "sequence": "GTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTT |
56 | | -AGGGTTAGGGGTAGGGTTAGGGTTAGGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTAGGGCTAGGGTTAAGGGTAGGGTTAGCGAAAGGGCTG |
57 | | -GGGTTAGGGGTGCGGGTACGCGTAGCATTAGGGCTAGAAGTAGGATCTGCAGTGCCTGACCGCGTCTGCGCGGCGACTGCCCAAAGCCTGGGGCCGACTCCAG |
58 | | -GCTGAAGCTCAT", "length": 250, "qualityScores": ">=<=???>?>???=??>>8<?><=2=<===1194<?;:?>>?#3==>######## |
59 | | -####################################################################################################### |
60 | | -############################################################################################", |
61 | | -"qualityScoreVariant": "FASTQ_SANGER"} |
62 | | -
|
63 | | -scala> val sequences = biojavaContext.loadGenbankDna("SCU49845.gb") |
64 | | -sequences: org.bdgenomics.adam.rdd.sequence.SequenceRDD = SequenceRDD(MapPartitionsRDD[7] at map at BiojavaAdamContext.java:244,SequenceDictionary{ |
65 | | -U49845->5028}) |
66 | | -
|
67 | | -scala> sequences.rdd.first |
68 | | -res1: org.bdgenomics.formats.avro.Sequence = {"name": "U49845", "description": "Saccharomyces cerevisiae |
| 48 | +res0: org.bdgenomics.formats.avro.Read = {"name": "SRR062634.10022079/1", "description": |
| 49 | +"SRR062634.10022079/1", "alphabet": "DNA", "sequence": "AATTCAAAACCAGCCTGGCCAATATGGTGAAACCTCATCTCTACTAAAAA |
| 50 | +TACAAAAATTAGCCAGGCATGGTGGTGCGTGCGTGTAGTCCCAGCTACTT", "length": 100, "qualityScores": "?-DDBEEB=EEEDDEDEEEA |
| 51 | +:D?5?E?CEBE5ED?D:AEDEDEDED-B,BC0AC,BB6@CDBDEC?BCBAA@5,=8CA-?A>?2:&048<BB5BE#####", "qualityScoreVariant": |
| 52 | +"FASTQ_SANGER", "attributes": {}} |
| 53 | +
|
| 54 | +scala> val dna = bc.loadBiojavaFastaDna("src/test/resources/org/biojava/nbio/adam/hla_gen.0.fa") |
| 55 | +dna: org.bdgenomics.adam.rdd.sequence.SequenceRDD = RDDBoundSequenceRDD with 0 reference sequences |
| 56 | +
|
| 57 | +scala> dna.rdd.first |
| 58 | +res1: org.bdgenomics.formats.avro.Sequence = {"name": "HLA:HLA00001 A*01:01:01:01 3503 bp", |
| 59 | +"description": null, "alphabet": "DNA", "sequence": "CAGGAGCAGAGGGGTCAGGGCGAAGTCCCAGGGCCCCAGGCGTGGCTCTCAG |
| 60 | +GGTCTCAGGCCCCGAAGGCGGTGTATGGATTGGGGAGTCCCAGCCTTGGGGATTCCCCAACTCCGCAGTTTCTTTTCTCCCTCTCCCAACCTACGTAGGGTCCTT |
| 61 | +CATCCTGGATACTCACGACGCGGACCCAGTTCTCACTCCCATTGGGTGTCGGGTTTCCAGAGAAGCCAATCAGTGTCGTCGCGGTCGCTGTTCTAAAGTCCGCAC |
| 62 | +... |
| 63 | +
|
| 64 | +scala> val prot = bc.loadBiojavaFastaProtein("src/test/resources/org/biojava/nbio/adam/hla_prot.0.fa") |
| 65 | +prot: org.bdgenomics.adam.rdd.sequence.SequenceRDD = RDDBoundSequenceRDD with 0 reference sequences |
| 66 | +
|
| 67 | +scala> prot.rdd.first |
| 68 | +res2: org.bdgenomics.formats.avro.Sequence = {"name": "HLA:HLA00001 A*01:01:01:01 365 bp", "description": |
| 69 | +null, "alphabet": "PROTEIN", "sequence": "MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSD |
| 70 | +AASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITK |
| 71 | +RKWEAVHAAEQRRVYLEGRCVDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVV |
| 72 | +PSGEEQRYTCHVQHEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSLTACKV", "length": |
| 73 | +365, "attributes": {}} |
| 74 | +
|
| 75 | +scala> val genbankDna = bc.loadGenbankDna("src/test/resources/org/biojava/nbio/adam/SCU49845.gb") |
| 76 | +genbankDna: org.bdgenomics.adam.rdd.sequence.SequenceRDD = RDDBoundSequenceRDD with 0 reference sequences |
| 77 | +
|
| 78 | +scala> genbankDna.rdd.first |
| 79 | +res4: org.bdgenomics.formats.avro.Sequence = {"name": "U49845", "description": "Saccharomyces cerevisiae |
69 | 80 | TCP1-beta gene, partial cds; and Axl2p\n(AXL2) and Rev7p (REV7) genes, complete cds.", "alphabet": "DNA", |
70 | 81 | "sequence": "GATCCTCCATATACAACGGTATCTCCACCTCAGGTTTAGATCTCAACAACGGAACCATTGCCGACATGAGACAGTTAGGTATCGTCGAGAGT |
71 | 82 | TACAAGCTAAAACGAGCAGTAGTCAGCTCTGCATCTGAAGCCGCTGAAGTTCTACTAAGGGTGGATAACATCATCCGTGCAAGACCAAGAACCGCCAATAGACAA |
72 | 83 | CATATGTAACATATTTAGGATATACCTCGAAAATAATAAACCGCCACACTGTCATTATTATAATTAGAAACAGAACGCAAAAATTATCCACTATATAATTCAAAG |
73 | 84 | ... |
| 85 | +
|
| 86 | +scala> val features = bc.loadGenbankDnaFeatures("src/test/resources/org/biojava/nbio/adam/SCU49845.gb") |
| 87 | +features: org.bdgenomics.adam.rdd.feature.FeatureRDD = RDDBoundFeatureRDD with 0 reference sequences |
| 88 | +
|
| 89 | +scala> features.rdd.first |
| 90 | +res5: org.bdgenomics.formats.avro.Feature = {"featureId": null, "name": "source", "source": null, |
| 91 | +"featureType": null, "contigName": "U49845", "start": 0, "end": 5028, "strand": "FORWARD", "phase": |
| 92 | +null, "frame": null, "score": null, "geneId": null, "transcriptId": null, "exonId": null, "aliases": |
| 93 | +[], "parentIds": [], "target": null, "gap": null, "derivesFrom": null, "notes": [], "dbxrefs": [], |
| 94 | +"ontologyTerms": [], "circular": null, "attributes": {}} |
74 | 95 | ``` |
0 commit comments