@@ -96,7 +96,7 @@ def process_gtf(gtf_file): #make this into a seperate script that processes the
9696 """
9797
9898 print ("The gtf file you provided is " + gtf_file )
99- print ("Reading the gtf may take a while depending on the size of your gtf file " )
99+ print ("Reading the gtf may take a minute... " )
100100
101101 # calculate how long it takes to read gtf_file and report it
102102 start_time = time .time ()
@@ -146,7 +146,6 @@ def process_gtf(gtf_file): #make this into a seperate script that processes the
146146 gtf_exons_gr = gtf_exons_gr .drop_duplicate_positions (strand = True ) # Why are so many gone after this?
147147
148148 # Print the number of unique exons, transcript ids, and gene ids
149- print ("+++++++++++++++++++++++++++++++++++++++++++++++++++++++" )
150149 print ("The number of unique exons is " + str (len (gtf_exons_gr .exon_id .unique ())))
151150 print ("The number of unique transcript ids is " + str (len (gtf_exons_gr .transcript_id .unique ())))
152151 print ("The number of unique gene ids is " + str (len (gtf_exons_gr .gene_id .unique ())))
@@ -301,34 +300,46 @@ def mapping_juncs_exons(juncs_gr, gtf_exons_gr, singletons):
301300 else :
302301 return juncs_gr , juncs_coords_unique , clusters
303302
304- def visualize_junctions (dat , junc_id ):
303+ def basepair_to_kilobase (bp ):
304+ return bp / 1000 # Convert base pairs to kilobases
305305
306- #
306+ def visualize_junctions ( dat , junc_id ):
307307
308308 # Filter data for the specific junction ID
309309 dat = dat [dat .Cluster == dat [dat .junction_id == junc_id ].Cluster .values [0 ]]
310310
311311 # Get junctions
312- juncs = dat [["chromStart" , "chromEnd" , "strand" ]]
312+ juncs = dat [["chrom" , " chromStart" , "chromEnd" , "strand" , "intron_length" , "counts_total" , "thickStart" , "thickEnd " ]]
313313 juncs = juncs .drop_duplicates ()
314+ juncs ["junc_usage_ratio" ] = dat ["counts_total" ] / dat ["counts_total" ].sum ()
314315
315316 # Sort junctions based on strand
316317 if juncs .strand .values [0 ] == "+" :
317318 juncs = juncs .sort_values ("chromStart" )
318319 else :
319320 juncs = juncs .sort_values ("chromEnd" , ascending = False )
320321
322+ # Convert genomic coordinates to kilobases
323+ juncs ["chromStart_kb" ] = basepair_to_kilobase (juncs ["chromStart" ])
324+ juncs ["chromEnd_kb" ] = basepair_to_kilobase (juncs ["chromEnd" ])
325+ print (juncs )
326+ # Create colormap
327+ cmap = plt .get_cmap ('plasma' )
328+
321329 # Create the plot
322330 fig , ax = plt .subplots (figsize = (10 , len (juncs ) * 0.5 ))
323331
324- # Plot junctions as lines
332+ # Plot junctions as lines with varying colors based on usage ratio
325333 for i , (_ , junc ) in enumerate (juncs .iterrows ()):
326- ax .plot ([junc ["chromStart" ], junc ["chromEnd" ]], [i , i ], color = "red" )
334+ color = cmap (junc ["junc_usage_ratio" ])
335+ ax .plot ([junc ["chromStart_kb" ], junc ["chromEnd_kb" ]], [i , i ], color = color )
336+ ax .text (junc ["chromEnd_kb" ], i , f'{ junc ["junc_usage_ratio" ]:.2f} ' , verticalalignment = 'center' , fontsize = 8 , color = color )
327337
328- # Set labels and title
329- ax .set_xlabel ("Genomic Position" )
338+ # Set labels and title
339+ ax .set_xlabel ("Genomic Position on chr" + juncs . chrom . values [ 0 ] + " (" + juncs . strand . values [ 0 ] + ") [Kilobases] " )
330340 ax .set_yticks (list (range (len (juncs ))))
331341 ax .set_title (f"Visualization of Junctions in Cluster { dat .Cluster .values [0 ]} in the Gene { dat .gene_id .values [0 ]} " )
342+ ax .xaxis .set_major_formatter (plt .FuncFormatter (lambda x , _ : '{:.0f}' .format (x ))) # Disable scientific notation
332343 print ("The junction of interest is " + junc_id )
333344 plt .show ()
334345
@@ -464,10 +475,6 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
464475 # update juncs_gr to only include junctions that are part of clusters and update juncs_coords_unique to only include junctions that are part of clusters
465476 juncs_gr = juncs_gr [juncs_gr .junction_id .isin (clusters .junction_id )]
466477 juncs_coords_unique = juncs_coords_unique [juncs_coords_unique .junction_id .isin (clusters .junction_id )]
467- print (all_juncs .head ())
468- # update all_juncs
469- print (type (all_juncs ))
470- # all_juncs = all_juncs[all_juncs.junction_id.isin(juncs_coords_unique.junction_id)]
471478 print ("The number of clusters after removing singletons is " + str (len (clusters .Cluster .unique ())))
472479
473480 # 14. After re-clustering above, need to confirm that junctions still share splice sites
@@ -482,20 +489,21 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
482489
483490 if gtf_file is not None :
484491 clusts_unique = clusters .df [["Cluster" , "junction_id" , "gene_id" , "Count" ]].drop_duplicates ()
492+ juncs_gr = juncs_gr [["Chromosome" , "Start" , "End" , "Strand" , "junction_id" , "Start_b" , "End_b" , "gene_id" , "gene_name" , "transcript_id" , "exon_id" ]]
493+ juncs_gr = juncs_gr .drop_duplicate_positions ()
494+ juncs_gr .to_bed (junc_bed_file , chain = True ) #add option to add prefix to file name
495+ print ("Saved final list of junction coordinates to " + junc_bed_file )
485496 else :
486497 clusts_unique = clusters .df [["Cluster" , "junction_id" , "Count" ]].drop_duplicates ()
498+ juncs_gr = juncs_gr [["Chromosome" , "Start" , "End" , "Strand" , "junction_id" ]]
499+ juncs_gr = juncs_gr .drop_duplicate_positions ()
500+ juncs_gr .to_bed (junc_bed_file , chain = True )
501+ print ("Saved final list of junction coordinates to " + junc_bed_file )
487502
488503 # merge juncs_gr with corresponding cluster id
489504 all_juncs_df = all_juncs .merge (clusts_unique , how = "left" )
490505
491- # print the column names of all_juncs_df
492- print ("The columns in all_juncs_df are " + str (all_juncs_df .columns ))
493-
494506 # get final list of junction coordinates and save to bed file for visualization
495- juncs_gr = juncs_gr [["Chromosome" , "Start" , "End" , "Strand" , "junction_id" ]]
496- juncs_gr = juncs_gr .drop_duplicate_positions ()
497- juncs_gr .to_bed (junc_bed_file , chain = True ) #add option to add prefix to file name
498-
499507 print ("The number of clusters to be finally evaluated is " + str (len (all_juncs_df .Cluster .unique ())))
500508 print ("The number of junctions to be finally evaluated is " + str (len (all_juncs_df .junction_id .unique ())))
501509
0 commit comments