Trinity best transcript set

Method for running Trinity, then processing results with TransDecoder and InterProscan, then reducing the sequences to a "best" set using cd-hit and some scripts.

#external software  references:
#   https://sourceforge.net/projects/lemonade-assemble/
#      for a tar.gz including all of the following, or
#      download one at a time:
#      (Next three do: 'grep gcc fasta*.c' for compile commands)
#      http://saf.bio.caltech.edu/pub/software/molbio/fastaselecth.c
#      http://saf.bio.caltech.edu/pub/software/molbio/fastasplitn.c
#      http://saf.bio.caltech.edu/pub/software/molbio/fastaproperties.c
#      (next four, check internal paths to needed programs before running)
#      http://saf.bio.caltech.edu/pub/software/molbio/many_blast_plus_1cpu.sh
#      http://saf.bio.caltech.edu/pub/software/molbio/many_hmmscan_1cpu.sh
#      http://saf.bio.caltech.edu/pub/software/molbio/trinity_cdhit_feeder.pl
#      http://saf.bio.caltech.edu/pub/software/linux_or_unix_tools/match_many.pl
#   (next is for "extract", a small utility used below)
#   https://sourceforge.net/projects/drmtools/
#   (programs from elsewhere, versions used are indicated)
#   NCBI toolbox+ (ncbi-blast-2.7.1+-1.x86_64)
#   Hmmer  3.1b1
#   Trinity 2.8.3
#   TransDecoder v5.3.0
#   InterProscan 5.30-69.0 
#   cdhit-2017_10_18
#   jellyfish 2.2.6
#   bowtie2 2.3.4.2
#path includes jellyfish-2.2.6 and bowtie2 2.3.4.2
#TRINITY_PATH is to Trinity directory
#TRANSDECODER_PATH is to TransDecoder directory
#IPS_PATH is to InterproScan directory
#SWISS_DB points to the root name of a blast formatted database made from swissprot.
#   If the files are /home/data/swissprot/sw.psq (etc.) then
#   SWISS_DB would be: /home/data/swissprot/sw
#CDHIT_PATH is to cdhit directory 

#Interproscan output formats. other supported options: "tsv,gff3" or "gff3".
#  Other IPS supported formats may be included but they will not be automatically
#  converted to a reduced set. For these additional formats (other than tsv and gff3) 
#  manually run IPS a second time on the "best" subset  using as input the 
#  Transcriptome_reduced.fasta after that is produced. 

IPS_FORMATS=tsv   
NCPUS=40
CDH_BAND="-b 180"  

#  It helps to increase the cd-hit band_width parameter.  When the default of 20
#  is used two sequences that differ by an internal exon of 50aa (as an indel)
#  will NOT merge.  The larger value is slower (increases the width of the alignment
#  band) but merges many of these cases.

nice perl $TRINITY_PATH/Trinity \
  --seqType fq  --max_memory 50G\
  --left  left.fastq \
  --right right.fastq \
  --CPU $NCPUS   --output 283_Trinity  --full_cleanup \
  </dev/null >283_Trinity.log 2>&1 &

mkdir ../TransDecoder
cp 283_Trinity/Trinity.fasta ../TransDecoder/Transcriptome.fasta
cd ../TransDecoder

#first pass, simple run of TransDecoder with blast and hmmsearch

nice $TRANSDECODER_PATH/TransDecoder.LongOrfs -t Transcriptome.fasta \
   >pass1.log 2>&1 &

# forcing hmmscan to run as N processes of 1 thread is
# generally much faster than as 1 process of N threads.
nice many_hmmscan_1cpu.sh \
  Transcriptome.fasta.transdecoder_dir/longest_orfs.pep \
  hmm_results.txt \
  $NCPUS ""  >pass_2.log 2>&1 &

# forcing blastp to run as N processes of 1 thread is
# generally much faster than as 1 process of N threads.
nice ~/bin/many_blast_plus_1cpu.sh \
  blastp \
  Transcriptome.fasta.transdecoder_dir/longest_orfs.pep \
  blastp.outfmt6 \
  $NCPUS \
  " -db $SWISS_DB -outfmt 6 -evalue 1e-5 " \
  >pass_3.log 2>&1 &

nice ~/src/TransDecoder/TransDecoder.Predict \
  -t Transcriptome.fasta \
  --retain_pfam_hits hmm_results.txt \
  --retain_blastp_hits blastp.outfmt6 \
  >pass_4.log 2>&1 &

mkdir ../IPS
# InterProscan does not like the trailing "*" in the peptide sequence
tr -d '*' < ../TransDecoder/Transcriptome.fasta.transdecoder.pep \
  >../IPS/proteins.fasta
cd ../IPS
nice $IPS_PATH/interproscan.sh -appl PfamA -iprlookup \
  -goterms -f $IPS_FORMATS -dp -i proteins.fasta \
  >interproscan.log 2>&1 &

cd ../TransDecoder
#run cd-hit on each Trinity transcript family's proteins separately
nice trinity_cdhit_feeder.pl \
  -infile Transcriptome.fasta.transdecoder.pep \
  -output Transcriptome.fasta.transdecoder.pep_pass1 \
  -tmp_dir /tmp/tmpfeed -max_children 40 -params "$CDH_BAND" >cdhit_feeder.log 2>&1 &

#run cd-hit on the results of the preceding.
# ( Doing two cd-hit runs this way eliminated
# in the data set where this was developed
# a small number (6) of partial protein sequences which
# were present when cd-hit was only run directly on the
# TransDecoder pep file.)
#
$CDHIT_PATH/cd-hit -i Transcriptome.fasta.transdecoder.pep_pass1.fasta \
   -o Transcriptome.fasta.transdecoder.pep_pass2 -d 0 -T 40 $CDH_BAND

#reduce the TransDecoder and IPS results to just those in the cd-hit output set
#pull out the entry names, dropping the .p1, .p2, etc.  We only want a single
#copy of each transcript so only one of these .p* entries should be retained.

extract -in Transcriptome.fasta.transdecoder.pep_pass2 \
  -wl 1000000 -mt -dl '>.' -fmt '[1]' -if '>' -ifonly \
  | uniq >keep_these

#entire transcript for selected protein
cat keep_these | fastaselecth -sel - -in Transcriptome.fasta \
    >Transcriptome_reduced.fasta 2>subset.log &
cat keep_these | match_many.pl Transcriptome.fasta.transdecoder.gff3 1 \
    >Transcriptome_reduced.transdecoder.gff3
#  match_many.pl will emit status messages something like:
#  keys read: 24425
#  records read: 61325, emitted: 26575; keys matched: 14480

cat keep_these | match_many.pl Transcriptome.fasta.transdecoder.bed  1 \
   >Transcriptome_reduced.transdecoder.bed
cp  Transcriptome.fasta.transdecoder.pep_pass2 Transcriptome_reduced.transdecoder.pep

#pull out the entry names, RETAIN .p1, .p2, etc

extract -in Transcriptome.fasta.transdecoder.pep_pass2 \
  -wl 1000000 -mt -dl '> ' -fmt '[1]' -if '>' -ifonly >keep_these
#cds region of transcript for selected protein
cat keep_these | fastaselecth -sel - -in Transcriptome.fasta.transdecoder.cds \
   >Transcriptome_reduced.transdecoder.cds
USE_TSV=`echo $IPS_FORMATS | grep -i tsv`
if [ "$USE_TSV" ]
then
cat keep_these | match_many.pl ../IPS/Transcriptome/proteins.fasta.tsv 1 \
   >Transcriptome_reduced.ips.tsv
fi
USE_GFF3=`echo $IPS_FORMATS | grep -i gff3`
if [ "$USE_GFF3" ]
then
cat keep_these | match_many.pl ../IPS/Transcriptome/proteins.fasta.gff3 1 \
   >Transcriptome_reduced.ips.gff3
fi


#Files produced:
Transcriptome.fasta          #full Transcriptome transcripts
Transcriptome_reduced.fasta  #reduced Transcriptome transcripts
Transcriptome_reduced.transdecoder.pep    
Transcriptome_reduced.transdecoder.gff3
Transcriptome_reduced.transdecoder.cds
Transcriptome_reduced.transdecoder.bed
Transcriptome_reduced.ips.tsv

Attributions

Contributed by David Mathog, Sept 19, 2018

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Trinity best transcript set

Attributions

Uh oh!

Uh oh!

Uh oh!

Clone this wiki locally