forked from fmalmeida/pythonScripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplot_dna_features.py
More file actions
236 lines (188 loc) · 10.4 KB
/
plot_dna_features.py
File metadata and controls
236 lines (188 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#!/usr/bin/env python
# coding: utf-8
## Def help message
"""
A simple script to plot DNA features using the DNA features python package
Copyright (C) 2020 Felipe Marques de Almeida (almeidafmarques@gmail.com)
License: Public Domain
Usage:
plot_dna_features.py
plot_dna_features.py -h|--help
plot_dna_features.py -v|--version
plot_dna_features.py check-gff (--input <gff>)
plot_dna_features.py (--input <gff> | --fofn <file>) (--start <start_base> --end <end_base> --contig <contig_name>) [--feature <feature_type> --identification <id> --title <title> --label <label> --color <color> --output <png_out> --width <width> --height <height>]
Options:
-h --help Show this screen.
-v --version Show version information
check-gff Does a simple parsing of the GFF file so the user knows the available qualifiers that
can be used as gene identifiers. GFF qualifiers are retrieved from the 9th column.
--input=<gff> Used to plot dna features from a single GFF file
--fofn=<file> Used to plot dna features from multiple GFF files. Contents must be in csv format with 3 columns:
gff,custom_label,color (HEX format). Features from each GFF will have the color set in the 3rd column,
labeled as -> 'custom_label: gene id'.
--start=<start_base> Starting position for plotting.
--end=<end_base> Ending position for plotting.
--contig=<contig_name> Name of the contig which you want to plot.
--identification=<id> Which GFF qualifier must be used as gene identification?
Please check for available qualifiers with 'check-gff' [default: ID]
--title=<title> Plot title [default: Gene Plot].
--label=<label> Custom label for plotting. Legends will be in the following format: 'Custom label: gene id' [default: Gene].
--feature=<feature_type> Type of the GFF feature (3rd column) which you want to plot [default: gene].
--color=<color> HEX entry for desired plotting color [default: #ccccff].
--width=<width> Plot width ratio [default: 20].
--height=<height> Plot height ratio [default: 5].
--output=<png_out> Output PNG filename [default: ./out.png].
"""
##################################
### Loading Necessary Packages ###
##################################
from docopt import docopt
from dna_features_viewer import *
import Bio.SeqIO
import pprint
from BCBio import GFF
from BCBio.GFF import GFFExaminer
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
##################################################
### Function for checking available qualifiers ###
##################################################
def check_gff(infile):
# GFF overview
print("GFF overview:\n")
examiner = GFFExaminer()
in_handle = open(infile)
pprint.pprint(examiner.available_limits(in_handle))
in_handle.close()
print("")
# Load GFF and its sequences
gff = GFF.parse(infile)
# Check qualifiers
for rec in gff:
print("Example of the GFF's first line available qualifiers from the 9th column:\n")
print(rec.features[0])
print("\nPlease select only one of the available qualifiers to be used as gene identification!")
exit()
######################################################
### Function for execution with a single GFF input ###
######################################################
def single_gff(infile, start, end, contig, feature, qualifier, coloring, custom_label, outfile, plot_title, plot_width, plot_height):
# Subset GFF based on chr and feature type
limit_info = dict(
gff_id = [contig],
gff_type = [feature]
)
# Load GFF and its sequences
gff = GFF.parse(infile, limit_info=limit_info)
# Create empty features and legened list
features = []
## Populate features list
## Filtering by location
start_nt = int(start)
end_nt = int(end)
length = end_nt - start_nt
for rec in gff:
for i in range(0, len(rec.features)):
if ( int(rec.features[i].location.start) >= int(start_nt) and int(rec.features[i].location.end) <= int(end_nt) ):
if (str(rec.features[i].location.strand) == "+"):
strand=+1
else:
strand=-1
# Create input string
## Label in the gene plot
# input= GraphicFeature(start=int(rec.features[i].location.start), end=int(rec.features[i].location.end),
# strand=int(strand), label="{0}: {1}".format(custom_label, str(rec.features[i].qualifiers[qualifier][0])), color=coloring)
## Label not in the gene plot
input= GraphicFeature(start=int(rec.features[i].location.start), end=int(rec.features[i].location.end),
strand=int(strand), label=str(rec.features[i].qualifiers[qualifier][0]), color=coloring)
# Append
features.append(input)
# Draw plot
record = GraphicRecord(sequence_length=length, features=features, first_index=start_nt)
ax, _ = record.plot(figure_width=int(plot_width), figure_height=int(plot_height))
## Label not in the gene plot (using separate legend box)
legend = ax.legend(handles=[mpatches.Patch(facecolor=coloring, label="{0}".format(custom_label), linewidth = 0.5, edgecolor = 'black')],
loc = 1, title=plot_title, fontsize = 'medium', fancybox = True)
ax.figure.savefig(outfile, bbox_inches='tight')
#######################################################
### Function for execution with multiple GFF inputs ###
#######################################################
def multiple_gff(input_fofn, start, end, contig, qualifier, feature, outfile, plot_title, plot_width, plot_height):
# Open list of filenames containing GFFs
file = open(input_fofn, 'r')
content = file.readlines()
# Create empty features and legened list
features = []
legend_entries = []
# Begin Parsing
for line in content:
data = line.strip().split(",", 3)
infile = data[0]
labeling = data[1]
coloring = data[2]
# Subset GFF based on chr and feature type
limit_info = dict(
gff_id = [contig],
gff_type = [feature]
)
# Load GFF and its sequences
gff = GFF.parse(infile, limit_info=limit_info)
## Populate features list
## Filtering by location
start_nt = int(start)
end_nt = int(end)
length = end_nt - start_nt
for rec in gff:
for i in range(0, len(rec.features)):
if ( int(rec.features[i].location.start) >= int(start_nt) and int(rec.features[i].location.end) <= int(end_nt) ):
if (str(rec.features[i].location.strand) == "+"):
strand=+1
else:
strand=-1
# Create input string
## Label in the gene plot
# input= GraphicFeature(start=int(rec.features[i].location.start), end=int(rec.features[i].location.end),
# strand=int(strand), label="{0}: {1}".format(labeling, str(rec.features[i].qualifiers[qualifier][0])), color=coloring)
## Label not in the gene plot
input= GraphicFeature(start=int(rec.features[i].location.start), end=int(rec.features[i].location.end),
strand=int(strand), label=str(rec.features[i].qualifiers[qualifier][0]), color=coloring)
# Append DNA features plot
features.append(input)
# Append to legend
legend_entries.append(
mpatches.Patch(facecolor=coloring, label="{0}".format(labeling), linewidth = 0.5, edgecolor = 'black')
)
# Draw plot
record = GraphicRecord(sequence_length=length, features=features, first_index=start_nt)
ax, _ = record.plot(figure_width=int(plot_width), figure_height=int(plot_height))
## Label not in the gene plot (using separate legend box)
legend = ax.legend(handles=legend_entries, loc = 1, title=plot_title, fontsize = 'medium', fancybox = True)
ax.figure.savefig(outfile, bbox_inches='tight')
## Main
if __name__ == '__main__':
arguments = docopt(__doc__, version='v1.0 by Felipe Marques de Almeida')
## Check GFF
if arguments['check-gff'] and arguments['--input']:
check_gff(arguments['--input'])
## Single GFF
if arguments['--input'] and arguments['--start'] and arguments['--end'] and arguments['--contig']:
print("Executing the pipeline for a single GFF input")
single_gff(infile=arguments['--input'], start=arguments['--start'], end=arguments['--end'],
contig=arguments['--contig'], feature=arguments['--feature'], coloring=arguments['--color'],
custom_label=arguments['--label'], outfile=arguments['--output'], plot_title=arguments['--title'],
qualifier=arguments['--identification'], plot_width=arguments['--width'], plot_height=arguments['--height'])
print("Done, checkout the results in {}".format(arguments['--output']))
## Multiple GFFs
elif arguments['--fofn'] and arguments['--start'] and arguments['--end'] and arguments['--contig']:
print("Executing the pipeline for multiple GFF inputs")
multiple_gff(input_fofn=arguments['--fofn'], start=arguments['--start'], end=arguments['--end'],
contig=arguments['--contig'], feature=arguments['--feature'], outfile=arguments['--output'],
qualifier=arguments['--identification'], plot_title=arguments['--title'],
plot_width=arguments['--width'], plot_height=arguments['--height'])
print("Done, checkout the results in {}".format(arguments['--output']))
## None
else:
print("Missing mandatory arguments")
print("Please, check out the help message")
print("")
print(arguments)