
Commit 1d176e2

update reqs
1 parent a62aa40 commit 1d176e2

File tree

14 files changed: +122 additions, -91 deletions


LeafletSC.egg-info/PKG-INFO

Lines changed: 43 additions & 20 deletions
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: LeafletSC
-Version: 0.1.2
+Version: 0.1.4
 Summary: Alternative splicing quantification in single cells with Leaflet
 Home-page: https://github.com/daklab/Leaflet
 Author: Karin Isaev, Columbia University and NYGC
@@ -12,26 +12,27 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9.15
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: gtfparse==1.3.0
-Requires-Dist: matplotlib==3.7.1
-Requires-Dist: numpy==1.23.5
-Requires-Dist: pandas==1.5.3
+Requires-Dist: gtfparse==2.5.0
+Requires-Dist: matplotlib
+Requires-Dist: numpy
+Requires-Dist: pandas
 Requires-Dist: pyranges==0.0.129
-Requires-Dist: scanpy==1.9.8
-Requires-Dist: scikit_learn==1.2.2
-Requires-Dist: scipy==1.10.1
-Requires-Dist: seaborn==0.13.2
-Requires-Dist: setuptools==69.1.1
-Requires-Dist: torch==2.2.1
-Requires-Dist: tqdm==4.66.2
-Requires-Dist: umap_learn==0.5.3
+Requires-Dist: scanpy
+Requires-Dist: scikit_learn
+Requires-Dist: scipy
+Requires-Dist: seaborn
+Requires-Dist: setuptools
+Requires-Dist: torch==1.12.1
+Requires-Dist: tqdm
+Requires-Dist: umap
+Requires-Dist: tables==3.4.4
 
 # LeafletSC
 
 LeafletSC is a binomial mixture model designed for the analysis of alternative splicing events in single-cell RNA sequencing data. The model facilitates understanding and quantifying splicing variability at the single-cell level. Below is the graphical model representation:
 
 <p align="center">
-<img src="https://github.com/daklab/Leaflet/assets/23510936/3e147ba5-7ee8-47ae-b84c-5e99e0551acf" width="500">
+<img src="https://github.com/daklab/Leaflet/assets/23510936/2c7981fe-91ec-4830-b010-b74ac4140940">
 </p>
 
 ## Compatibility with sequencing platforms
@@ -42,29 +43,50 @@ LeafletSC supports analysis from the following single-cell RNA sequencing platfo
 
 ## Getting Started
 
-LeafletSC is implemented in Python and requires Python version 3.9 or higher. You can easily install LeafletSC via PyPI using the following command:
+LeafletSC is implemented in Python and requires Python version 3.9 or higher. We recommend the following approach:
 
 ```bash
-pip install LeafletSC
+# create a conda environment with python 3.9
+conda create -n "LeafletSC" python=3.9.15 ipython
+# activate environment
+conda activate LeafletSC
+# install latest version of LeafletSC into this environment
+pip install LeafletSC==0.1.2
 ```
 
-Please also make sure you have regtools installed. Prior to using LeafletSC, please run regtools on your single-cell BAM files. Here is an example of what this might look like in a Snakefile:
+Once the package is installed, you can load it in python as follows:
+```python
+import LeafletSC
+
+# or specific submodules
+from LeafletSC.utils import *
+from LeafletSC.clustering import *
+```
+
+## Requirements
+Prior to using LeafletSC, please run **regtools** on your single-cell BAM files. Here is an example of what this might look like in a Snakefile:
 
 ```Snakemake
 {params.regtools_path} junctions extract -a 6 -m 50 -M 500000 {input.bam_use} -o {output.juncs} -s XS -b {output.barcodes}
 # Combine junctions and cell barcodes
 paste --delimiters='\t' {output.juncs} {output.barcodes} > {output.juncswbarcodes}
 ```
-
-Once you have your junction files, you can try out the mixture model tutorial under [Tutorials](Tutorials/run_binomial_mixture_model.ipynb)
+- Once you have your junction files, you can try out the mixture model tutorial under [Tutorials](Tutorials/run_binomial_mixture_model.ipynb)
+- While optional, we recommend running LeafletSC intron clustering with a gtf file so that junctions can be first mapped to annotated splicing events.
 
 ## Capabilities
 With LeafletSC, you can:
 
-- Infer cell states influenced by alternative splicing and identify significant splice junctions.
+- Infer cell states influenced by alternative splicing and identify differentially spliced regions.
 - Conduct differential splicing analysis between specific cell groups if cell identities are known.
 - Generate synthetic alternative splicing datasets for robust analysis testing.
 
+## How does it work?
+The full method can be found in our [paper](https://www.biorxiv.org/content/10.1101/2023.10.17.562774v3) while the graphical model is shown below:
+<p align="center">
+<img src="https://github.com/daklab/Leaflet/assets/23510936/3e147ba5-7ee8-47ae-b84c-5e99e0551acf">
+</p>
+
 ## If you use Leaflet, please cite our [paper](https://www.biorxiv.org/content/10.1101/2023.10.17.562774v3)
 
 ```
@@ -85,3 +107,4 @@ With LeafletSC, you can:
 2. Add 10X/split-seq mode in addition to smart-seq2
 3. Extend framework to seurat/scanpy anndata objects
 4. Add notes on generative model and inference method
+5. Clean up dependencies
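
An aside for readers of the README changes above: the phrase "binomial mixture model" can be made concrete with a generic sketch of the model class. This is an illustration under assumed notation, not the exact formulation from the linked paper:

```latex
% Sketch of a generic binomial mixture over junction counts (assumed notation,
% not the paper's exact model).
% For cell c with latent state z_c and junction j inside an intron cluster:
%   y_{cj} = reads supporting junction j in cell c
%   n_{cj} = total reads across junction j's intron cluster in cell c
\[
  z_c \sim \mathrm{Categorical}(\pi), \qquad
  y_{cj} \mid z_c = k \sim \mathrm{Binomial}\left(n_{cj},\, p_{kj}\right)
\]
% The empirical ratio y_{cj}/n_{cj} is the junc_ratio column computed in
% prep_model_input.py later in this commit.
```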

LeafletSC.egg-info/SOURCES.txt

Lines changed: 3 additions & 3 deletions
@@ -7,9 +7,9 @@ LeafletSC.egg-info/SOURCES.txt
 LeafletSC.egg-info/dependency_links.txt
 LeafletSC.egg-info/requires.txt
 LeafletSC.egg-info/top_level.txt
-LeafletSC/beta-binomial-mix/__init__.py
-LeafletSC/beta-binomial-mix/cellstate_consistency.py
-LeafletSC/beta-binomial-mix/model.py
+LeafletSC/beta_binomial_mix/__init__.py
+LeafletSC/beta_binomial_mix/cellstate_consistency.py
+LeafletSC/beta_binomial_mix/model.py
 LeafletSC/clustering/__init__.py
 LeafletSC/clustering/load_cluster_data.py
 LeafletSC/clustering/obtain_intron_clusters.py
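
The hyphen-to-underscore rename tracked above is what makes the subpackage importable: Python identifiers cannot contain hyphens, so the old directory name was unreachable with a normal import statement. A quick illustration, assuming the installed package:

```python
# Before the rename, the directory name was not a valid module path:
#   import LeafletSC.beta-binomial-mix.model   # SyntaxError
# After the rename, a plain import works:
from LeafletSC.beta_binomial_mix import model
```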

LeafletSC.egg-info/requires.txt

Lines changed: 13 additions & 12 deletions
@@ -1,13 +1,14 @@
-gtfparse==1.3.0
-matplotlib==3.7.1
-numpy==1.23.5
-pandas==1.5.3
+gtfparse==2.5.0
+matplotlib
+numpy
+pandas
 pyranges==0.0.129
-scanpy==1.9.8
-scikit_learn==1.2.2
-scipy==1.10.1
-seaborn==0.13.2
-setuptools==69.1.1
-torch==2.2.1
-tqdm==4.66.2
-umap_learn==0.5.3
+scanpy
+scikit_learn
+scipy
+seaborn
+setuptools
+torch==1.12.1
+tqdm
+umap
+tables==3.4.4
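
One note on the new `tables==3.4.4` entry: pandas' HDF5 I/O (`DataFrame.to_hdf`, used in prep_model_input.py below) is backed by the PyTables package, which pip installs as `tables`. A minimal round-trip sketch, with a hypothetical file name:

```python
import pandas as pd  # to_hdf/read_hdf require PyTables ("tables") to be installed

df = pd.DataFrame({"cell_id": ["c1", "c2"], "junc_count": [3, 5]})
df.to_hdf("example.h5", key="df", mode="w", complevel=9, complib="zlib")
assert pd.read_hdf("example.h5", key="df").equals(df)
```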
File renamed without changes.

build/lib/LeafletSC/beta-binomial-mix/cellstate_consistency.py renamed to build/lib/LeafletSC/beta_binomial_mix/cellstate_consistency.py

File renamed without changes.
File renamed without changes.

build/lib/LeafletSC/clustering/obtain_intron_clusters.py

Lines changed: 8 additions & 31 deletions
@@ -74,13 +74,8 @@
                     default="no",
                     help='yes if want to remove lowly used junctions in clusters, default is no')
 
-parser.add_argument('--strict_filter', dest='strict_filter',
-                    default=True,
-                    help='default is True, this means that only clusters with less junctions that the mean \
-                    junction count per cluster is included. This is meant to remove very complex \
-                    splicing events that might be hard to make sense of in the single cell context especially.')
-
-args = parser.parse_args()
+#args = parser.parse_args()
+args = parser.parse_args(args=[])
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # Utilities
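
A note on the `parse_args(args=[])` pattern introduced above: passing an explicit empty list makes argparse skip `sys.argv` entirely, so every option falls back to its default and the module can be imported (for example from a notebook) without argparse choking on foreign arguments, at the cost of silently ignoring any real command-line flags. A minimal sketch:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--singleton", default="no")

# With args=[], nothing is read from sys.argv; every value is its default.
args = parser.parse_args(args=[])
print(args.singleton)  # prints "no" regardless of the real command line
```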
@@ -89,7 +84,6 @@
 def process_gtf(gtf_file): #make this into a seperate script that processes the gtf file into gr object that can be used in the main scriptas input
 
     print("The gtf file you provided is " + gtf_file)
-    print("Now reading gtf file using gtfparse")
     print("This step may take a while depending on the size of your gtf file")
 
     # calculate how long it takes to read gtf_file and report it
@@ -129,9 +123,11 @@ def process_gtf(gtf_file): #make this into a seperate script that processes the
     gtf_exons_gr = gtf_exons_gr.drop_duplicate_positions(strand=True) # Why are so many gone after this?
 
     # Print the number of unique exons, transcript ids, and gene ids
+    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++")
     print("The number of unique exons is " + str(len(gtf_exons_gr.exon_id.unique())))
     print("The number of unique transcript ids is " + str(len(gtf_exons_gr.transcript_id.unique())))
     print("The number of unique gene ids is " + str(len(gtf_exons_gr.gene_id.unique())))
+    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++")
     return(gtf_exons_gr)
 
 def filter_junctions_by_shared_splice_sites(df):
@@ -153,7 +149,7 @@ def filter_group(group):
 # Run analysis and obtain intron clusters
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++
 
-def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, threshold_inc, min_intron, max_intron, min_junc_reads, singleton, strict_filter, junc_suffix, min_num_cells_wjunc, filter_low_juncratios_inclust):
+def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, threshold_inc, min_intron, max_intron, min_junc_reads, singleton, junc_suffix, min_num_cells_wjunc, filter_low_juncratios_inclust):
 
     #1. Check format of junc_files and convert to list if necessary
     # Can either be a list of folders with junction files or a single folder with junction files
@@ -170,10 +166,6 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
     # 3. Process each path
     for junc_path in junc_files:
 
-        # make sure junc_path has "/" at the end
-        #if not junc_path.endswith("/"):
-        #    junc_path = junc_path + "/"
-
         junc_path = Path(junc_path)
         print(f"Reading in junction files from {junc_path}")
 
@@ -183,11 +175,6 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
             print(f"No junction files found in {junc_path} with suffix {junc_suffix}")
             continue
 
-        #junc_files_in_path = glob.glob(junc_path + "*" + junc_suffix) # Adjusted to correctly form the glob pattern
-        #if not junc_files_in_path:
-        #    print(f"No junction files found in {junc_path} with suffix {junc_suffix}")
-        #    continue
-
         print(f"The number of regtools junction files to be processed is {len(junc_files_in_path)}")
 
         files_not_read = []
@@ -197,14 +184,14 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
             try:
                 juncs = pd.read_csv(junc_file, sep="\t", header=None)
                 juncs['file_name'] = junc_file # Add the file name as a new column
-                #juncs['cell_type'] = junc_file.split("/")[-1]
                 juncs['cell_type'] = junc_file
                 all_juncs_list.append(juncs) # Append the DataFrame to the list
             except Exception as e:
                 print(f"Could not read in {junc_file}: {e}")
                 files_not_read.append(junc_file)
 
-        print("The total number of files that could not be read is " + str(len(files_not_read)) + " as these had no junctions")
+        if(len(files_not_read) > 0):
+            print("The total number of files that could not be read is " + str(len(files_not_read)) + " as these had no junctions")
 
         # 5. Concatenate all DataFrames into a single DataFrame
         all_juncs = pd.concat(all_juncs_list, ignore_index=True) if all_juncs_list else pd.DataFrame()
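
Because the junction files are read with `header=None`, the columns come back integer-indexed. regtools `junctions extract` emits BED12-style records, so one could name the columns explicitly; the names below are the conventional BED12 set plus the barcode column pasted on in the Snakefile step, an assumption for illustration rather than something this commit defines:

```python
import pandas as pd

bed12_cols = ["chrom", "chromStart", "chromEnd", "name", "score", "strand",
              "thickStart", "thickEnd", "itemRgb", "blockCount",
              "blockSizes", "blockStarts"]
# "sample.juncswbarcodes" is a hypothetical output of the Snakefile paste step.
juncs = pd.read_csv("sample.juncswbarcodes", sep="\t", header=None,
                    names=bed12_cols + ["barcode"])
```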
@@ -247,7 +234,6 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
         all_juncs["intron_length"] = all_juncs["chromEnd"] - all_juncs["chromStart"]
         mask = (all_juncs["intron_length"] >= min_intron) & (all_juncs["intron_length"] <= max_intron)
         all_juncs = all_juncs[mask]
-        print("Filtering based on intron length")
 
         # Filter for 'chrom' column to handle "chr" prefix
         all_juncs = all_juncs.copy()
@@ -264,7 +250,6 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
         all_juncs['junction_id'] = all_juncs['chrom'] + '_' + all_juncs['chromStart'].astype(str) + '_' + all_juncs['chromEnd'].astype(str)
 
         # Get total score for each junction and merge with all_juncs with new column "total_counts"
-
         all_juncs = all_juncs.groupby('junction_id').agg({'score': 'sum'}).reset_index().merge(all_juncs, on='junction_id', how='left')
 
         # rename score_x and score_y to total_junc_counts and score
@@ -319,7 +304,6 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
 
         # 9. if singleton is False, remove clusters with only one junction
         if singleton == False:
-            print(clusters.Count.value_counts())
             clusters = clusters[clusters.Count > 1]
             print("The number of clusters after removing singletons is " + str(len(clusters.Cluster.unique())))
 
@@ -349,7 +333,6 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
 
         # check if any clusters are singletons now and remove if have singleton == False
         if singleton == False:
-            print(filtered_clusters_df.Count.value_counts())
             filtered_clusters_df = filtered_clusters_df[filtered_clusters_df.Count > 1]
             print("The number of clusters after removing singletons is " + str(len(filtered_clusters_df.Cluster.unique())))
 
@@ -411,11 +394,6 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
         singleton=True
     else:
         singleton=False
-    # ensure strict_filter is boolean
-    if args.strict_filter == "True":
-        strict_filter=True
-    else:
-        strict_filter=False
 
     # print out all user defined arguments that were chosen
     print("The following arguments were chosen:" , flush=True)
@@ -431,7 +409,6 @@ def main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, thre
     print("junc_suffix: " + junc_suffix, flush=True)
     print("min_num_cells_wjunc: " + str(min_num_cells_wjunc), flush=True)
     print("singleton: " + str(singleton), flush=True)
-    print("strict_filter: " + str(strict_filter), flush=True)
     print("filter_low_juncratios_inclust: " + (filter_low_juncratios_inclust), flush=True)
 
-    main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, threshold_inc, min_intron, max_intron, min_junc_reads, singleton, strict_filter, junc_suffix, min_num_cells_wjunc, filter_low_juncratios_inclust)
+    main(junc_files, gtf_file, output_file, sequencing_type, junc_bed_file, threshold_inc, min_intron, max_intron, min_junc_reads, singleton, junc_suffix, min_num_cells_wjunc, filter_low_juncratios_inclust)

build/lib/LeafletSC/clustering/prep_model_input.py

Lines changed: 19 additions & 14 deletions
@@ -5,26 +5,34 @@
 from tqdm import tqdm
 import concurrent.futures
 import time
+import tables
 
-pd.options.mode.chained_assignment = None # default='warn'
-
-import warnings
-warnings.filterwarnings("ignore", category=FutureWarning, module="pandas.core.strings")
+#pd.options.mode.chained_assignment = None # default='warn'
+#import warnings
+#warnings.filterwarnings("ignore", category=FutureWarning, module="pandas.core.strings")
 
 parser = argparse.ArgumentParser(description='Read in file that lists junctions for all samples, one file per line and no header')
 
 parser.add_argument('--intron_clusters', dest='intron_clusters',
                     help='path to the file that has the intron cluster events and junction information from running intron_clustering.py')
-parser.add_argument('--output_file', dest='output_file',
+
+parser.add_argument('--output_file', dest='output_file',
+                    default="output_file",
                     help='how you want to name the output file, this will be the input for all Leaflet models')
+
 parser.add_argument('--has_genes', dest='has_genes',
+                    default="no",
                     help='yes if intron clustering was done with a gtf file, No if intron clustering was done in an annotation free manner')
-parser.add_argument('--chunk_size', dest='chunk_size', default=5000,
+
+parser.add_argument('--chunk_size', dest='chunk_size',
+                    default=5000,
                     help='how many lines to read in at a time, default is 5000')
+
 parser.add_argument('--metadata', dest='metadata',
                     default=None,
                     help='path to the metadata file, if provided, the output file will have cell type information')
-args = parser.parse_args()
+
+args, unknown = parser.parse_known_args()
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # Utilities
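
The switch from `parse_args()` to `parse_known_args()` above changes failure behavior: unrecognized flags are returned in a second list instead of aborting the script, which helps when the interpreter is launched by a host (such as a Jupyter kernel) that injects its own argv. A runnable sketch:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--chunk_size", default=5000, type=int)

# Unknown flags land in `unknown` instead of triggering a usage error:
args, unknown = parser.parse_known_args(["--chunk_size", "1000", "-f", "kernel.json"])
print(args.chunk_size)  # 1000
print(unknown)          # ['-f', 'kernel.json'], e.g. argv injected by a notebook kernel
```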
@@ -125,17 +133,11 @@ def main(intron_clusters, output_file, has_genes, chunk_size, metadata):
     print("The number of total cells evaluated is " + str(len(all_cells)))
 
     cells_types = clusts[["cell_type", "cell_id"]].drop_duplicates()
-    print(clusts.head())
     print("The number of cells per cell type is:")
     print(cells_types.groupby(["cell_type"])["cell_type"].count())
 
-    print("Ensuring that each cell-junction pair appears only once")
     summarized_data = summarized_data.drop_duplicates(subset=['cell_id', 'junction_id'], keep='last') #double check if this is still necessary
-
-    print("Merge cluster counts with summarized data")
-
     summarized_data = clust_cell_counts.merge(summarized_data)
-    print("Done merging cluster counts with summarized data")
 
     print(np.unique(summarized_data['cell_id'].values))
     summarized_data["junc_ratio"] = summarized_data["junc_count"] / summarized_data["Cluster_Counts"]
@@ -152,15 +154,18 @@ def main(intron_clusters, output_file, has_genes, chunk_size, metadata):
         # if "/" detected in name (cell_type) replace it with "_"
         if "/" in name:
             name = name.replace("/", "_")
-        print("saving " + name + " as hdf file")
         group.to_hdf(output_file + "_" + name + ".h5", key='df', mode='w', complevel=9, complib='zlib')
+        print("You can find the resulting file at " + output_file + "_" + name + ".h5")
 
     if metadata is None:
         # save summarized_data as hdf file
         summarized_data.to_hdf(output_file + ".h5", key='df', mode='w', complevel=9, complib='zlib')
+        print("You can find the resulting file at " + output_file + ".h5")
+
     print("Done generating input file for Leaflet model. This process took " + str(round(time.time() - start_time)) + " seconds")
 
 if __name__ == '__main__':
+
     intron_clusters=args.intron_clusters
     output_file=args.output_file
     has_genes=args.has_genes

dist/LeafletSC-0.1.2.tar.gz

-23.2 KB
Binary file not shown.
