3535 "GGU" :"Gly" , "GGC" :"Gly" , "GGA" :"Gly" , "GGG" :"Gly" ,}
3636
3737
38- def get_cod_freq (gene ,ID ) :
38+ def get_cod_freq (cds : str ,ID : str ) -> pd . DataFrame :
3939 "computes absolute codon counts in the input gene coding sequence"
40- codon_count = dict ()
4140
4241 #ignore 1-fold Met and Trp, along with stop codons
4342 non_deg = ['AUG' , "UAA" ,"UAG" , "UGA" , "UGG" ]
44-
45- for codon in list (codon_aa .keys ()):
46- if codon not in non_deg :
47- codon_count [codon ]= 0
48- #make a list of codons
49- codons = []
50- for c in range (0 ,len (gene ),3 ):
51- cod = gene [c :c + 3 ]
52- if 'N' not in cod : ##ignore N
53- codons .append (cod )
54- else :
55- continue
56- for c in list (codon_count .keys ()):
57- codon_count [c ]+= codons .count (c )
43+ codon_count = dict ()
44+ codon_count = {codon : 0 for codon in codon_aa if codon not in non_deg }
45+ ##count codons in cds
46+ for i in range (0 , len (cds ), 3 ):
47+ codon = cds [i :i + 3 ]
48+ if codon in codon_count :
49+ codon_count [codon ] += 1
5850
5951 df_codcnt = pd .DataFrame (list (codon_count .items ()) )
6052 df_codcnt .columns = ['Codon' , 'Obs_Freq' ]
6153 df_codcnt ['Amino_acid' ] = [codon_aa [codon ] for codon in df_codcnt ['Codon' ].values ]
62- df_codcnt ['Length' ]= len (gene )
54+ df_codcnt ['Length' ]= len (cds )
6355 df_codcnt ['SeqID' ]= ID
6456 return df_codcnt
6557
6658#compute relative usagae from codon frequency
67- def compute_rscu_weights (df_codcnt ) :
68- """ wij = RSCUij/ RSCU i,max """
59+ def compute_rscu_weights (df_codcnt : pd . DataFrame ) -> pd . DataFrame :
60+ """Computes the RSCU of the CDS """
6961 aa_groups = df_codcnt .groupby ('Amino_acid' )
7062 aa = df_codcnt ['Amino_acid' ].unique () #make a list of all amino acids to iterate over
7163 df_list = []
7264 for a in aa :
7365 d = aa_groups .get_group (a )
74- d ['RSCU' ] = d ['Obs_Freq' ].values / d ['Obs_Freq' ].mean () #obs/expected freq,
66+ if d ['Obs_Freq' ].mean () == 0.0 :
67+ d ['RSCU' ] = 0.0
68+ else :
69+ d ['RSCU' ] = d ['Obs_Freq' ].values / d ['Obs_Freq' ].mean () #obs/expected freq,
7570 d ['AA-Codon' ]= d ['Amino_acid' ]+ '-' + d ['Codon' ]
7671 df_list .append (d )
7772 rscu = pd .concat (df_list ).fillna (0 ) #some genomes may not use any amino acids
78- return rscu # rscu of a gene
73+ return rscu
7974
8075
8176if __name__ == '__main__' :
@@ -90,15 +85,15 @@ def compute_rscu_weights(df_codcnt):
9085
9186
9287 headers ,seqs = fix_fasta .fix_fasta (args .CDS )##preprocess fasta to a paired list of headers and sequences
88+
9389 df_list = []
9490 for i in range (len (seqs )):
95- gene = seqs [i ].replace ('T' ,'U' ).upper ()
96- ID = headers [i ]
97- if len (gene )% 3 != 0 :
98- print ('Gene { } not multiple of 3'. format ( ID ) )
91+ cds = seqs [i ].replace ('T' ,'U' ).upper ()
92+ ID = headers [i ]. split ( ' ' )[ 0 ]
93+ if len (cds )% 3 != 0 :
94+ print ( f'WARNING! Skipping CDS { ID } not multiple of 3' )
9995 continue
100-
101- df_codcnt = get_cod_freq (gene ,ID )
96+ df_codcnt = get_cod_freq (cds ,ID )
10297 df_codcnt .to_csv ('df_count2.csv' ,index = False )
10398 rscu = compute_rscu_weights (df_codcnt )
10499 df_list .append (rscu ) ## append RSCU matrix of each gene
@@ -109,12 +104,9 @@ def compute_rscu_weights(df_codcnt):
109104 for rscu in df_list : #rname rscu_list
110105 r3 = rscu .set_index ('AA-Codon' ).drop (omit , axis = 1 ).T
111106 r3 .reset_index (drop = True )
112-
113107 ##add a gene information column
114108 r3 ['SeqID' ]= rscu ['SeqID' ].values [0 ]
115109 r3 ['Length' ]= rscu ['Length' ].values [0 ]
116110
117111 comb .append (r3 )
118-
119-
120112 pd .concat (comb ,axis = 0 ).reset_index (drop = True ).to_csv (args .out + '_rscu.csv' ,index = False )
0 commit comments