Commit a4a59e0

improve visualization
1 parent 4853ed3 commit a4a59e0

File tree

9 files changed (+432 additions, −932 deletions)


LeafletSC.egg-info/PKG-INFO

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: LeafletSC
-Version: 0.2.16
+Version: 0.2.17
 Summary: Alternative splicing quantification in single cells with Leaflet
 Home-page: https://github.com/daklab/Leaflet
 Author: Karin Isaev, Columbia University and NYGC

LeafletSC/clustering/find_intron_clusters.py

Lines changed: 2 additions & 0 deletions

@@ -14,6 +14,8 @@
 import concurrent.futures
 import matplotlib.pyplot as plt
 import seaborn as sns
+from matplotlib.lines import Line2D
+

 warnings.filterwarnings("ignore", category=FutureWarning, module="pyranges")

 parser = argparse.ArgumentParser(description='Read in file that lists junctions for all samples, \
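
Note: Line2D is matplotlib's basic line artist and is most often imported to build proxy handles for a custom legend, presumably to annotate the reworked junction plot (its use is not shown in this hunk). A minimal illustrative sketch of that pattern, with made-up colors and labels:

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

fig, ax = plt.subplots()
ax.plot([0, 10], [0, 0], color="purple")   # a junction drawn as a horizontal line
ax.plot([2, 8], [1, 1], color="orange")

# Proxy artists: legend entries that are independent of any plotted data
handles = [
    Line2D([0], [0], color="purple", label="high usage"),
    Line2D([0], [0], color="orange", label="low usage"),
]
ax.legend(handles=handles, loc="upper right")
plt.show()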

Tutorials/01_run_intron_clustering.ipynb

Lines changed: 31 additions & 541 deletions
Large diffs are not rendered by default.

build/lib/LeafletSC/clustering/find_intron_clusters.py

Lines changed: 28 additions & 20 deletions
@@ -96,7 +96,7 @@ def process_gtf(gtf_file): #make this into a seperate script that processes the
     """

     print("The gtf file you provided is " + gtf_file)
-    print("Reading the gtf may take a while depending on the size of your gtf file")
+    print("Reading the gtf may take a minute...")

     # calculate how long it takes to read gtf_file and report it
     start_time = time.time()
@@ -146,7 +146,6 @@ def process_gtf(gtf_file): #make this into a seperate script that processes the
     gtf_exons_gr = gtf_exons_gr.drop_duplicate_positions(strand=True) # Why are so many gone after this?

     # Print the number of unique exons, transcript ids, and gene ids
-    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++")
     print("The number of unique exons is " + str(len(gtf_exons_gr.exon_id.unique())))
     print("The number of unique transcript ids is " + str(len(gtf_exons_gr.transcript_id.unique())))
     print("The number of unique gene ids is " + str(len(gtf_exons_gr.gene_id.unique())))
@@ -301,34 +300,46 @@ def mapping_juncs_exons(juncs_gr, gtf_exons_gr, singletons):
301300
else:
302301
return juncs_gr, juncs_coords_unique, clusters
303302

304-
def visualize_junctions(dat, junc_id):
303+
def basepair_to_kilobase(bp):
304+
return bp / 1000 # Convert base pairs to kilobases
305305

306-
#
306+
def visualize_junctions(dat, junc_id):
307307

308308
# Filter data for the specific junction ID
309309
dat = dat[dat.Cluster == dat[dat.junction_id == junc_id].Cluster.values[0]]
310310

311311
# Get junctions
312-
juncs = dat[["chromStart", "chromEnd", "strand"]]
312+
juncs = dat[["chrom", "chromStart", "chromEnd", "strand", "intron_length", "counts_total", "thickStart", "thickEnd"]]
313313
juncs = juncs.drop_duplicates()
314+
juncs["junc_usage_ratio"] = dat["counts_total"] / dat["counts_total"].sum()
314315

315316
# Sort junctions based on strand
316317
if juncs.strand.values[0] == "+":
317318
juncs = juncs.sort_values("chromStart")
318319
else:
319320
juncs = juncs.sort_values("chromEnd", ascending=False)
320321

322+
# Convert genomic coordinates to kilobases
323+
juncs["chromStart_kb"] = basepair_to_kilobase(juncs["chromStart"])
324+
juncs["chromEnd_kb"] = basepair_to_kilobase(juncs["chromEnd"])
325+
print(juncs)
326+
# Create colormap
327+
cmap = plt.get_cmap('plasma')
328+
321329
# Create the plot
322330
fig, ax = plt.subplots(figsize=(10, len(juncs) * 0.5))
323331

324-
# Plot junctions as lines
332+
# Plot junctions as lines with varying colors based on usage ratio
325333
for i, (_, junc) in enumerate(juncs.iterrows()):
326-
ax.plot([junc["chromStart"], junc["chromEnd"]], [i, i], color="red")
334+
color = cmap(junc["junc_usage_ratio"])
335+
ax.plot([junc["chromStart_kb"], junc["chromEnd_kb"]], [i, i], color=color)
336+
ax.text(junc["chromEnd_kb"], i, f'{junc["junc_usage_ratio"]:.2f}', verticalalignment='center', fontsize=8, color=color)
327337

328-
# Set labels and title
329-
ax.set_xlabel("Genomic Position")
338+
# Set labels and title
339+
ax.set_xlabel("Genomic Position on chr" + juncs.chrom.values[0] + " (" + juncs.strand.values[0] + ") [Kilobases]")
330340
ax.set_yticks(list(range(len(juncs))))
331341
ax.set_title(f"Visualization of Junctions in Cluster {dat.Cluster.values[0]} in the Gene {dat.gene_id.values[0]}")
342+
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: '{:.0f}'.format(x))) # Disable scientific notation
332343
print("The junction of interest is " + junc_id)
333344
plt.show()
334345

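For context, the reworked visualize_junctions above expects a per-junction dataframe that already carries cluster assignments, BED-style coordinates, and read counts. A hypothetical usage sketch with toy values (column names follow the hunk above; the import path and all data values are assumptions):

import pandas as pd
# Assumed import path; the function is defined in LeafletSC/clustering/find_intron_clusters.py
from LeafletSC.clustering.find_intron_clusters import visualize_junctions

# Toy cluster of three junctions on the + strand of chromosome 1 (made-up values)
dat = pd.DataFrame({
    "junction_id":   ["1_100_500", "1_100_900", "1_600_900"],
    "Cluster":       [1, 1, 1],
    "gene_id":       ["geneA"] * 3,        # hypothetical gene id
    "chrom":         ["1"] * 3,
    "chromStart":    [100, 100, 600],
    "chromEnd":      [500, 900, 900],
    "strand":        ["+"] * 3,
    "intron_length": [400, 800, 300],
    "counts_total":  [30, 5, 25],
    "thickStart":    [100, 100, 600],
    "thickEnd":      [500, 900, 900],
})

# Each junction in the cluster is drawn as a line, colored by its usage ratio
visualize_junctions(dat, "1_100_500")
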
@@ -464,10 +475,6 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
     # update juncs_gr to only include junctions that are part of clusters and update juncs_coords_unique to only include junctions that are part of clusters
     juncs_gr = juncs_gr[juncs_gr.junction_id.isin(clusters.junction_id)]
     juncs_coords_unique = juncs_coords_unique[juncs_coords_unique.junction_id.isin(clusters.junction_id)]
-    print(all_juncs.head())
-    # update all_juncs
-    print(type(all_juncs))
-    # all_juncs = all_juncs[all_juncs.junction_id.isin(juncs_coords_unique.junction_id)]
     print("The number of clusters after removing singletons is " + str(len(clusters.Cluster.unique())))

     # 14. After re-clustering above, need to confirm that junctions still share splice sites
@@ -482,20 +489,21 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre

     if gtf_file is not None:
         clusts_unique = clusters.df[["Cluster", "junction_id", "gene_id", "Count"]].drop_duplicates()
+        juncs_gr = juncs_gr[["Chromosome", "Start", "End", "Strand", "junction_id", "Start_b", "End_b", "gene_id", "gene_name", "transcript_id", "exon_id"]]
+        juncs_gr = juncs_gr.drop_duplicate_positions()
+        juncs_gr.to_bed(junc_bed_file, chain=True) #add option to add prefix to file name
+        print("Saved final list of junction coordinates to " + junc_bed_file)
     else:
         clusts_unique = clusters.df[["Cluster", "junction_id", "Count"]].drop_duplicates()
+        juncs_gr = juncs_gr[["Chromosome", "Start", "End", "Strand", "junction_id"]]
+        juncs_gr = juncs_gr.drop_duplicate_positions()
+        juncs_gr.to_bed(junc_bed_file, chain=True)
+        print("Saved final list of junction coordinates to " + junc_bed_file)

     # merge juncs_gr with corresponding cluster id
     all_juncs_df = all_juncs.merge(clusts_unique, how="left")

-    # print the column names of all_juncs_df
-    print("The columns in all_juncs_df are " + str(all_juncs_df.columns))
-
     # get final list of junction coordinates and save to bed file for visualization
-    juncs_gr = juncs_gr[["Chromosome", "Start", "End", "Strand", "junction_id"]]
-    juncs_gr = juncs_gr.drop_duplicate_positions()
-    juncs_gr.to_bed(junc_bed_file, chain=True) #add option to add prefix to file name
-
     print("The number of clusters to be finally evaluated is " + str(len(all_juncs_df.Cluster.unique())))
     print("The number of junctions to be finally evaluated is " + str(len(all_juncs_df.junction_id.unique())))

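The restructured block above now writes the final junction coordinates to junc_bed_file inside each branch of the gtf check. If needed downstream, the saved BED file can be read back with pyranges for inspection; a small sketch, assuming a hypothetical output path:

import pyranges as pr

junc_bed_file = "junctions.bed"        # hypothetical path passed earlier as junc_bed_file

juncs_gr = pr.read_bed(junc_bed_file)  # returns a PyRanges object
print(juncs_gr.df.head())              # underlying pandas DataFrame
print("Junctions loaded:", len(juncs_gr))
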
Binary file not shown.

0 commit comments
