Skip to content

Commit b11be72

Browse files
committed
optimized and resolved div by zero error
1 parent c8fa033 commit b11be72

File tree

3 files changed

+635
-30
lines changed

3 files changed

+635
-30
lines changed
33.3 KB
Binary file not shown.

Compute_RSCU_gene/__main__.py

Lines changed: 22 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -35,47 +35,42 @@
3535
"GGU":"Gly", "GGC":"Gly", "GGA":"Gly", "GGG":"Gly",}
3636

3737

38-
def get_cod_freq(gene,ID):
38+
def get_cod_freq(cds:str,ID:str)->pd.DataFrame:
3939
"computes absolute codon counts in the input gene coding sequence"
40-
codon_count=dict()
4140

4241
#ignore 1-fold Met and Trp, along with stop codons
4342
non_deg=['AUG', "UAA","UAG", "UGA", "UGG" ]
44-
45-
for codon in list(codon_aa.keys()):
46-
if codon not in non_deg :
47-
codon_count[codon]=0
48-
#make a list of codons
49-
codons=[]
50-
for c in range(0,len(gene),3):
51-
cod=gene[c:c+3]
52-
if 'N' not in cod: ##ignore N
53-
codons.append(cod)
54-
else:
55-
continue
56-
for c in list(codon_count.keys()):
57-
codon_count[c]+= codons.count(c)
43+
codon_count=dict()
44+
codon_count = {codon: 0 for codon in codon_aa if codon not in non_deg }
45+
##count codons in cds
46+
for i in range(0, len(cds), 3):
47+
codon = cds[i:i+3]
48+
if codon in codon_count:
49+
codon_count[codon] += 1
5850

5951
df_codcnt=pd.DataFrame(list(codon_count.items()) )
6052
df_codcnt.columns=['Codon', 'Obs_Freq']
6153
df_codcnt['Amino_acid'] = [codon_aa[codon] for codon in df_codcnt['Codon'].values]
62-
df_codcnt['Length']= len(gene)
54+
df_codcnt['Length']= len(cds)
6355
df_codcnt['SeqID']= ID
6456
return df_codcnt
6557

6658
#compute relative usagae from codon frequency
67-
def compute_rscu_weights(df_codcnt):
68-
""" wij = RSCUij/ RSCU i,max"""
59+
def compute_rscu_weights(df_codcnt:pd.DataFrame)->pd.DataFrame:
60+
"""Computes the RSCU of the CDS """
6961
aa_groups = df_codcnt.groupby('Amino_acid')
7062
aa = df_codcnt['Amino_acid'].unique() #make a list of all amino acids to iterate over
7163
df_list = []
7264
for a in aa:
7365
d=aa_groups.get_group(a)
74-
d['RSCU'] = d['Obs_Freq'].values/d['Obs_Freq'].mean() #obs/expected freq,
66+
if d['Obs_Freq'].mean() == 0.0:
67+
d['RSCU'] = 0.0
68+
else:
69+
d['RSCU'] = d['Obs_Freq'].values/d['Obs_Freq'].mean() #obs/expected freq,
7570
d['AA-Codon']=d['Amino_acid']+'-'+d['Codon']
7671
df_list.append(d)
7772
rscu = pd.concat(df_list).fillna(0) #some genomes may not use any amino acids
78-
return rscu # rscu of a gene
73+
return rscu
7974

8075

8176
if __name__=='__main__':
@@ -90,15 +85,15 @@ def compute_rscu_weights(df_codcnt):
9085

9186

9287
headers,seqs=fix_fasta.fix_fasta(args.CDS)##preprocess fasta to a paired list of headers and sequences
88+
9389
df_list=[]
9490
for i in range(len(seqs)):
95-
gene = seqs[i].replace('T','U').upper()
96-
ID = headers[i]
97-
if len(gene)%3 !=0:
98-
print('Gene {} not multiple of 3'.format(ID))
91+
cds= seqs[i].replace('T','U').upper()
92+
ID = headers[i].split(' ')[0]
93+
if len(cds)%3 !=0:
94+
print( f'WARNING! Skipping CDS {ID} not multiple of 3')
9995
continue
100-
101-
df_codcnt = get_cod_freq(gene,ID)
96+
df_codcnt = get_cod_freq(cds,ID)
10297
df_codcnt.to_csv('df_count2.csv',index=False)
10398
rscu = compute_rscu_weights(df_codcnt)
10499
df_list.append(rscu) ## append RSCU matrix of each gene
@@ -109,12 +104,9 @@ def compute_rscu_weights(df_codcnt):
109104
for rscu in df_list: #rname rscu_list
110105
r3 = rscu.set_index('AA-Codon').drop(omit, axis=1).T
111106
r3.reset_index(drop=True)
112-
113107
##add a gene information column
114108
r3['SeqID']= rscu['SeqID'].values[0]
115109
r3['Length']=rscu['Length'].values[0]
116110

117111
comb.append(r3)
118-
119-
120112
pd.concat(comb,axis=0).reset_index(drop=True).to_csv(args.out+'_rscu.csv',index=False)

0 commit comments

Comments
 (0)