-
Notifications
You must be signed in to change notification settings - Fork 1
Trinity best transcript set
mathog edited this page Aug 29, 2019
·
8 revisions
Method for running Trinity, then processing results with TransDecoder and InterProscan, then reducing the sequences to a "best" set using cd-hit and some scripts.
#external software references:
# https://sourceforge.net/projects/lemonade-assemble/
# for a tar.gz including all of the following, or
# download one at a time:
# (Next three do: 'grep gcc fasta*.c' for compile commands)
# http://saf.bio.caltech.edu/pub/software/molbio/fastaselecth.c
# http://saf.bio.caltech.edu/pub/software/molbio/fastasplitn.c
# http://saf.bio.caltech.edu/pub/software/molbio/fastaproperties.c
# (next four, check internal paths to needed programs before running)
# http://saf.bio.caltech.edu/pub/software/molbio/many_blast_plus_1cpu.sh
# http://saf.bio.caltech.edu/pub/software/molbio/many_hmmscan_1cpu.sh
# http://saf.bio.caltech.edu/pub/software/molbio/trinity_cdhit_feeder.pl
# http://saf.bio.caltech.edu/pub/software/linux_or_unix_tools/match_many.pl
# (next is for "extract", a small utility used below)
# https://sourceforge.net/projects/drmtools/
# (programs from elsewhere, versions used are indicated)
# NCBI toolbox+ (ncbi-blast-2.7.1+-1.x86_64)
# Hmmer 3.1b1
# Trinity 2.8.3
# TransDecoder v5.3.0
# InterProscan 5.30-69.0
# cdhit-2017_10_18
# jellyfish 2.2.6
# bowtie2 2.3.4.2
#path includes jellyfish-2.2.6 and bowtie2 2.3.4.2
#TRINITY_PATH is to Trinity directory
#TRANSDECODER_PATH is to TransDecoder directory
#IPS_PATH is to InterproScan directory
#SWISS_DB points to the root name of a blast formatted database made from swissprot.
# If the files are /home/data/swissprot/sw.psq (etc.) then
# SWISS_DB would be: /home/data/swissprot/sw
#CDHIT_PATH is to cdhit directory
#Interproscan output formats. other supported options: "tsv,gff3" or "gff3".
# Other IPS supported formats may be included but they will not be automatically
# converted to a reduced set. For these additional formats (other than tsv and gff3)
# manually run IPS a second time on the "best" subset using as input the
# Transcriptome_reduced.fasta after that is produced.
IPS_FORMATS=tsv
NCPUS=40
CDH_BAND="-b 180"
# It helps to increase the cd-hit band_width parameter. When the default of 20
# is used two sequences that differ by an internal exon of 50aa (as an indel)
# will NOT merge. The larger value is slower (increases the width of the alignment
# band) but merges many of these cases.
nice perl $TRINITY_PATH/Trinity \
--seqType fq --max_memory 50G\
--left left.fastq \
--right right.fastq \
--CPU $NCPUS --output 283_Trinity --full_cleanup \
</dev/null >283_Trinity.log 2>&1 &
mkdir ../TransDecoder
cp 283_Trinity/Trinity.fasta ../TransDecoder/Transcriptome.fasta
cd ../TransDecoder
#first pass, simple run of TransDecoder with blast and hmmsearch
nice $TRANSDECODER_PATH/TransDecoder.LongOrfs -t Transcriptome.fasta \
>pass1.log 2>&1 &
# forcing hmmscan to run as N processes of 1 thread is
# generally much faster than as 1 process of N threads.
nice many_hmmscan_1cpu.sh \
Transcriptome.fasta.transdecoder_dir/longest_orfs.pep \
hmm_results.txt \
$NCPUS "" >pass_2.log 2>&1 &
# forcing blastp to run as N processes of 1 thread is
# generally much faster than as 1 process of N threads.
nice ~/bin/many_blast_plus_1cpu.sh \
blastp \
Transcriptome.fasta.transdecoder_dir/longest_orfs.pep \
blastp.outfmt6 \
$NCPUS \
" -db $SWISS_DB -outfmt 6 -evalue 1e-5 " \
>pass_3.log 2>&1 &
nice ~/src/TransDecoder/TransDecoder.Predict \
-t Transcriptome.fasta \
--retain_pfam_hits hmm_results.txt \
--retain_blastp_hits blastp.outfmt6 \
>pass_4.log 2>&1 &
mkdir ../IPS
# InterProscan does not like the trailing "*" in the peptide sequence
tr -d '*' < ../TransDecoder/Transcriptome.fasta.transdecoder.pep \
>../IPS/proteins.fasta
cd ../IPS
nice $IPS_PATH/interproscan.sh -appl PfamA -iprlookup \
-goterms -f $IPS_FORMATS -dp -i proteins.fasta \
>interproscan.log 2>&1 &
cd ../TransDecoder
#run cd-hit on each Trinity transcript family's proteins separately
nice trinity_cdhit_feeder.pl \
-infile Transcriptome.fasta.transdecoder.pep \
-output Transcriptome.fasta.transdecoder.pep_pass1 \
-tmp_dir /tmp/tmpfeed -max_children 40 -params "$CDH_BAND" >cdhit_feeder.log 2>&1 &
#run cd-hit on the results of the preceding.
# ( Doing two cd-hit runs this way eliminated
# in the data set where this was developed
# a small number (6) of partial protein sequences which
# were present when cd-hit was only run directly on the
# TransDecoder pep file.)
#
$CDHIT_PATH/cd-hit -i Transcriptome.fasta.transdecoder.pep_pass1.fasta \
-o Transcriptome.fasta.transdecoder.pep_pass2 -d 0 -T 40 $CDH_BAND
#reduce the TransDecoder and IPS results to just those in the cd-hit output set
#pull out the entry names, dropping the .p1, .p2, etc. We only want a single
#copy of each transcript so only one of these .p* entries should be retained.
extract -in Transcriptome.fasta.transdecoder.pep_pass2 \
-wl 1000000 -mt -dl '>.' -fmt '[1]' -if '>' -ifonly \
| uniq >keep_these
#entire transcript for selected protein
cat keep_these | fastaselecth -sel - -in Transcriptome.fasta \
>Transcriptome_reduced.fasta 2>subset.log &
cat keep_these | match_many.pl Transcriptome.fasta.transdecoder.gff3 1 \
>Transcriptome_reduced.transdecoder.gff3
# match_many.pl will emit status messages something like:
# keys read: 24425
# records read: 61325, emitted: 26575; keys matched: 14480
cat keep_these | match_many.pl Transcriptome.fasta.transdecoder.bed 1 \
>Transcriptome_reduced.transdecoder.bed
cp Transcriptome.fasta.transdecoder.pep_pass2 Transcriptome_reduced.transdecoder.pep
#pull out the entry names, RETAIN .p1, .p2, etc
extract -in Transcriptome.fasta.transdecoder.pep_pass2 \
-wl 1000000 -mt -dl '> ' -fmt '[1]' -if '>' -ifonly >keep_these
#cds region of transcript for selected protein
cat keep_these | fastaselecth -sel - -in Transcriptome.fasta.transdecoder.cds \
>Transcriptome_reduced.transdecoder.cds
USE_TSV=`echo $IPS_FORMATS | grep -i tsv`
if [ "$USE_TSV" ]
then
cat keep_these | match_many.pl ../IPS/Transcriptome/proteins.fasta.tsv 1 \
>Transcriptome_reduced.ips.tsv
fi
USE_GFF3=`echo $IPS_FORMATS | grep -i gff3`
if [ "$USE_GFF3" ]
then
cat keep_these | match_many.pl ../IPS/Transcriptome/proteins.fasta.gff3 1 \
>Transcriptome_reduced.ips.gff3
fi
#Files produced:
Transcriptome.fasta #full Transcriptome transcripts
Transcriptome_reduced.fasta #reduced Transcriptome transcripts
Transcriptome_reduced.transdecoder.pep
Transcriptome_reduced.transdecoder.gff3
Transcriptome_reduced.transdecoder.cds
Transcriptome_reduced.transdecoder.bed
Transcriptome_reduced.ips.tsv
Contributed by David Mathog, Sept 19, 2018