Skip to content

Trinity best transcript set

mathog edited this page Aug 29, 2019 · 8 revisions

Method for running Trinity, then processing results with TransDecoder and InterProscan, then reducing the sequences to a "best" set using cd-hit and some scripts.

#external software  references:
#   https://sourceforge.net/projects/lemonade-assemble/
#      for a tar.gz including all of the following, or
#      download one at a time:
#      (Next three do: 'grep gcc fasta*.c' for compile commands)
#      http://saf.bio.caltech.edu/pub/software/molbio/fastaselecth.c
#      http://saf.bio.caltech.edu/pub/software/molbio/fastasplitn.c
#      http://saf.bio.caltech.edu/pub/software/molbio/fastaproperties.c
#      (next four, check internal paths to needed programs before running)
#      http://saf.bio.caltech.edu/pub/software/molbio/many_blast_plus_1cpu.sh
#      http://saf.bio.caltech.edu/pub/software/molbio/many_hmmscan_1cpu.sh
#      http://saf.bio.caltech.edu/pub/software/molbio/trinity_cdhit_feeder.pl
#      http://saf.bio.caltech.edu/pub/software/linux_or_unix_tools/match_many.pl
#   (next is for "extract", a small utility used below)
#   https://sourceforge.net/projects/drmtools/
#   (programs from elsewhere, versions used are indicated)
#   NCBI toolbox+ (ncbi-blast-2.7.1+-1.x86_64)
#   Hmmer  3.1b1
#   Trinity 2.8.3
#   TransDecoder v5.3.0
#   InterProscan 5.30-69.0 
#   cdhit-2017_10_18
#   jellyfish 2.2.6
#   bowtie2 2.3.4.2
#path includes jellyfish-2.2.6 and bowtie2 2.3.4.2
#TRINITY_PATH is to Trinity directory
#TRANSDECODER_PATH is to TransDecoder directory
#IPS_PATH is to InterproScan directory
#SWISS_DB points to the root name of a blast formatted database made from swissprot.
#   If the files are /home/data/swissprot/sw.psq (etc.) then
#   SWISS_DB would be: /home/data/swissprot/sw
#CDHIT_PATH is to cdhit directory 

#Interproscan output formats. other supported options: "tsv,gff3" or "gff3".
#  Other IPS supported formats may be included but they will not be automatically
#  converted to a reduced set. For these additional formats (other than tsv and gff3) 
#  manually run IPS a second time on the "best" subset  using as input the 
#  Transcriptome_reduced.fasta after that is produced. 

IPS_FORMATS=tsv   
NCPUS=40
CDH_BAND="-b 180"  

#  It helps to increase the cd-hit band_width parameter.  When the default of 20
#  is used two sequences that differ by an internal exon of 50aa (as an indel)
#  will NOT merge.  The larger value is slower (increases the width of the alignment
#  band) but merges many of these cases.

nice perl $TRINITY_PATH/Trinity \
  --seqType fq  --max_memory 50G\
  --left  left.fastq \
  --right right.fastq \
  --CPU $NCPUS   --output 283_Trinity  --full_cleanup \
  </dev/null >283_Trinity.log 2>&1 &

mkdir ../TransDecoder
cp 283_Trinity/Trinity.fasta ../TransDecoder/Transcriptome.fasta
cd ../TransDecoder

#first pass, simple run of TransDecoder with blast and hmmsearch

nice $TRANSDECODER_PATH/TransDecoder.LongOrfs -t Transcriptome.fasta \
   >pass1.log 2>&1 &

# forcing hmmscan to run as N processes of 1 thread is
# generally much faster than as 1 process of N threads.
nice many_hmmscan_1cpu.sh \
  Transcriptome.fasta.transdecoder_dir/longest_orfs.pep \
  hmm_results.txt \
  $NCPUS ""  >pass_2.log 2>&1 &

# forcing blastp to run as N processes of 1 thread is
# generally much faster than as 1 process of N threads.
nice ~/bin/many_blast_plus_1cpu.sh \
  blastp \
  Transcriptome.fasta.transdecoder_dir/longest_orfs.pep \
  blastp.outfmt6 \
  $NCPUS \
  " -db $SWISS_DB -outfmt 6 -evalue 1e-5 " \
  >pass_3.log 2>&1 &

nice ~/src/TransDecoder/TransDecoder.Predict \
  -t Transcriptome.fasta \
  --retain_pfam_hits hmm_results.txt \
  --retain_blastp_hits blastp.outfmt6 \
  >pass_4.log 2>&1 &

mkdir ../IPS
# InterProscan does not like the trailing "*" in the peptide sequence
tr -d '*' < ../TransDecoder/Transcriptome.fasta.transdecoder.pep \
  >../IPS/proteins.fasta
cd ../IPS
nice $IPS_PATH/interproscan.sh -appl PfamA -iprlookup \
  -goterms -f $IPS_FORMATS -dp -i proteins.fasta \
  >interproscan.log 2>&1 &

cd ../TransDecoder
#run cd-hit on each Trinity transcript family's proteins separately
nice trinity_cdhit_feeder.pl \
  -infile Transcriptome.fasta.transdecoder.pep \
  -output Transcriptome.fasta.transdecoder.pep_pass1 \
  -tmp_dir /tmp/tmpfeed -max_children 40 -params "$CDH_BAND" >cdhit_feeder.log 2>&1 &

#run cd-hit on the results of the preceding.
# ( Doing two cd-hit runs this way eliminated
# in the data set where this was developed
# a small number (6) of partial protein sequences which
# were present when cd-hit was only run directly on the
# TransDecoder pep file.)
#
$CDHIT_PATH/cd-hit -i Transcriptome.fasta.transdecoder.pep_pass1.fasta \
   -o Transcriptome.fasta.transdecoder.pep_pass2 -d 0 -T 40 $CDH_BAND

#reduce the TransDecoder and IPS results to just those in the cd-hit output set
#pull out the entry names, dropping the .p1, .p2, etc.  We only want a single
#copy of each transcript so only one of these .p* entries should be retained.

extract -in Transcriptome.fasta.transdecoder.pep_pass2 \
  -wl 1000000 -mt -dl '>.' -fmt '[1]' -if '>' -ifonly \
  | uniq >keep_these

#entire transcript for selected protein
cat keep_these | fastaselecth -sel - -in Transcriptome.fasta \
    >Transcriptome_reduced.fasta 2>subset.log &
cat keep_these | match_many.pl Transcriptome.fasta.transdecoder.gff3 1 \
    >Transcriptome_reduced.transdecoder.gff3
#  match_many.pl will emit status messages something like:
#  keys read: 24425
#  records read: 61325, emitted: 26575; keys matched: 14480

cat keep_these | match_many.pl Transcriptome.fasta.transdecoder.bed  1 \
   >Transcriptome_reduced.transdecoder.bed
cp  Transcriptome.fasta.transdecoder.pep_pass2 Transcriptome_reduced.transdecoder.pep

#pull out the entry names, RETAIN .p1, .p2, etc

extract -in Transcriptome.fasta.transdecoder.pep_pass2 \
  -wl 1000000 -mt -dl '> ' -fmt '[1]' -if '>' -ifonly >keep_these
#cds region of transcript for selected protein
cat keep_these | fastaselecth -sel - -in Transcriptome.fasta.transdecoder.cds \
   >Transcriptome_reduced.transdecoder.cds
USE_TSV=`echo $IPS_FORMATS | grep -i tsv`
if [ "$USE_TSV" ]
then
cat keep_these | match_many.pl ../IPS/Transcriptome/proteins.fasta.tsv 1 \
   >Transcriptome_reduced.ips.tsv
fi
USE_GFF3=`echo $IPS_FORMATS | grep -i gff3`
if [ "$USE_GFF3" ]
then
cat keep_these | match_many.pl ../IPS/Transcriptome/proteins.fasta.gff3 1 \
   >Transcriptome_reduced.ips.gff3
fi


#Files produced:
Transcriptome.fasta          #full Transcriptome transcripts
Transcriptome_reduced.fasta  #reduced Transcriptome transcripts
Transcriptome_reduced.transdecoder.pep    
Transcriptome_reduced.transdecoder.gff3
Transcriptome_reduced.transdecoder.cds
Transcriptome_reduced.transdecoder.bed
Transcriptome_reduced.ips.tsv

Attributions

Contributed by David Mathog, Sept 19, 2018

Clone this wiki locally